1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
// BLOCK_COMMENT annotates the generated-code stream for disassembly; it is
// compiled out in PRODUCT builds. STOP emits the message (debug only) and
// then stops execution via MacroAssembler::stop().
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif
  50 
  51 // C2 compiled method's prolog code.
  52 // Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame but only the size
  53 // of the extension space + the additional copy of the return address. That means, it doesn't contain the
  54 // frame size (where the local and sp_inc are) and the saved RBP.
  55 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  56   if (C->clinit_barrier_on_entry()) {
  57     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  58     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  59 
  60     Label L_skip_barrier;
  61     Register klass = rscratch1;
  62 
  63     mov_metadata(klass, C->method()->holder()->constant_encoding());
  64     clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
  65 
  66     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  67 
  68     bind(L_skip_barrier);
  69   }
  70 
  71   int framesize = C->output()->frame_size_in_bytes();
  72   int bangsize = C->output()->bang_size_in_bytes();
  73   bool fp_mode_24b = false;
  74   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  75 
  76   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  77 
  78   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  79   // Remove word for return addr
  80   framesize -= wordSize;
  81   stack_bang_size -= wordSize;
  82 
  83   // Calls to C2R adapters often do not accept exceptional returns.
  84   // We require that their callers must bang for them.  But be careful, because
  85   // some VM calls (such as call site linkage) can use several kilobytes of
  86   // stack.  But the stack safety zone should account for that.
  87   // See bugs 4446381, 4468289, 4497237.
  88   if (stack_bang_size > 0) {
  89     generate_stack_overflow_check(stack_bang_size);
  90 
  91     // We always push rbp, so that on return to interpreter rbp, will be
  92     // restored correctly and we can correct the stack.
  93     push(rbp);
  94 #ifdef ASSERT
  95     if (sp_inc > 0) {
  96       movl(Address(rsp, 0), badRegWordVal);
  97       movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
  98     }
  99 #endif
 100     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 101     if (PreserveFramePointer) {
 102       mov(rbp, rsp);
 103     }
 104     // Remove word for ebp
 105     framesize -= wordSize;
 106 
 107     // Create frame
 108     if (framesize) {
 109       subptr(rsp, framesize);
 110     }
 111   } else {
 112     subptr(rsp, framesize);
 113 
 114     // Save RBP register now.
 115     framesize -= wordSize;
 116     movptr(Address(rsp, framesize), rbp);
 117 #ifdef ASSERT
 118     if (sp_inc > 0) {
 119       movl(Address(rsp, framesize), badRegWordVal);
 120       movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
 121     }
 122 #endif
 123     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 124     if (PreserveFramePointer) {
 125       movptr(rbp, rsp);
 126       if (framesize > 0) {
 127         addptr(rbp, framesize);
 128       }
 129     }
 130   }
 131 
 132   if (C->needs_stack_repair()) {
 133     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 134     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 135     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
 136   }
 137 
 138   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 139     framesize -= wordSize;
 140     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 141   }
 142 
 143 #ifdef ASSERT
 144   if (VerifyStackAtCalls) {
 145     Label L;
 146     push(rax);
 147     mov(rax, rsp);
 148     andptr(rax, StackAlignmentInBytes-1);
 149     cmpptr(rax, StackAlignmentInBytes-wordSize);
 150     pop(rax);
 151     jcc(Assembler::equal, L);
 152     STOP("Stack is not properly aligned!");
 153     bind(L);
 154   }
 155 #endif
 156 }
 157 
 158 void C2_MacroAssembler::entry_barrier() {
 159   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 160   // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 161   Label dummy_slow_path;
 162   Label dummy_continuation;
 163   Label* slow_path = &dummy_slow_path;
 164   Label* continuation = &dummy_continuation;
 165   if (!Compile::current()->output()->in_scratch_emit_size()) {
 166     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 167     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 168     Compile::current()->output()->add_stub(stub);
 169     slow_path = &stub->entry();
 170     continuation = &stub->continuation();


 171   }
 172   bs->nmethod_entry_barrier(this, slow_path, continuation);
 173 }
 174 
 175 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 176   switch (vlen_in_bytes) {
 177     case  4: // fall-through
 178     case  8: // fall-through
 179     case 16: return Assembler::AVX_128bit;
 180     case 32: return Assembler::AVX_256bit;
 181     case 64: return Assembler::AVX_512bit;
 182 
 183     default: {
 184       ShouldNotReachHere();
 185       return Assembler::AVX_NoVec;
 186     }
 187   }
 188 }
 189 
 190 // fast_lock and fast_unlock used by C2
 191 
 192 // Because the transitions from emitted code to the runtime
 193 // monitorenter/exit helper stubs are so slow it's critical that
 194 // we inline both the stack-locking fast path and the inflated fast path.
 195 //
 196 // See also: cmpFastLock and cmpFastUnlock.
 197 //
 198 // What follows is a specialized inline transliteration of the code
 199 // in enter() and exit(). If we're concerned about I$ bloat another
 200 // option would be to emit TrySlowEnter and TrySlowExit methods
 201 // at startup-time.  These methods would accept arguments as
 202 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 203 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 204 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 205 // In practice, however, the # of lock sites is bounded and is usually small.
 206 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 210 //
 211 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 212 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 213 // to those specialized methods.  That'd give us a mostly platform-independent
 214 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 216 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 217 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 218 // (b) explicit barriers or fence operations.
 219 //
 220 // TODO:
 221 //
 222 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 223 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 224 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 225 //    the lock operators would typically be faster than reifying Self.
 226 //
 227 // *  Ideally I'd define the primitives as:
 228 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 229 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 230 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 231 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 232 //    Furthermore the register assignments are overconstrained, possibly resulting in
 233 //    sub-optimal code near the synchronization site.
 234 //
 235 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 236 //    Alternately, use a better sp-proximity test.
 237 //
 238 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 239 //    Either one is sufficient to uniquely identify a thread.
 240 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 241 //
 242 // *  Intrinsify notify() and notifyAll() for the common cases where the
 243 //    object is locked by the calling thread but the waitlist is empty.
 244 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 245 //
 246 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 247 //    But beware of excessive branch density on AMD Opterons.
 248 //
 249 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 250 //    or failure of the fast path.  If the fast path fails then we pass
 251 //    control to the slow path, typically in C.  In fast_lock and
 252 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 253 //    will emit a conditional branch immediately after the node.
 254 //    So we have branches to branches and lots of ICC.ZF games.
 255 //    Instead, it might be better to have C2 pass a "FailureLabel"
 256 //    into fast_lock and fast_unlock.  In the case of success, control
 257 //    will drop through the node.  ICC.ZF is undefined at exit.
 258 //    In the case of failure, the node will branch directly to the
 259 //    FailureLabel
 260 
 261 
 262 // obj: object to lock
 263 // box: on-stack box address -- KILLED
 264 // rax: tmp -- KILLED
 265 // t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  // Emits the C2 fast path for monitorenter. On exit ZF == 1 means the lock
  // was acquired; ZF == 0 means the caller must branch to the runtime slow
  // path (see "C2 uses the value of ZF" note at the end).
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to slow path
    // which reports/diagnoses the condition.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    // 'top' holds the lock-stack top offset; it aliases rax_reg when the
    // monitor table is in use (box must be preserved for the monitor cache).
    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive: obj already on top of this thread's lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax holds the expected (unlocked) mark, 'mark' the desired (locked) one.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the (tagged) monitor pointer is the mark itself.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      // Unroll the first few cache probes to avoid loop overhead for hits
      // near the front of the cache.
      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    // Monitor fields are addressed relative to the (possibly tagged) pointer;
    // subtract the tag when the pointer came from the mark word.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive: the CAS left the current owner in rax.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 426 
 427 // obj: object to lock
 428 // rax: tmp -- KILLED
 429 // t  : tmp - cannot be obj nor rax -- KILLED
 430 //
 431 // Some commentary on balanced locking:
 432 //
 433 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 434 // Methods that don't have provably balanced locking are forced to run in the
 435 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 436 // The interpreter provides two properties:
 437 // I1:  At return-time the interpreter automatically and quietly unlocks any
 438 //      objects acquired in the current activation (frame).  Recall that the
 439 //      interpreter maintains an on-stack list of locks currently held by
 440 //      a frame.
 441 // I2:  If a method attempts to unlock an object that is not held by the
 442 //      frame the interpreter throws IMSX.
 443 //
 444 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 445 // B() doesn't have provably balanced locking so it runs in the interpreter.
 446 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 447 // is still locked by A().
 448 //
 449 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 450 // Specification" states that an object locked by JNI's MonitorEnter should not be
 451 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 452 // specify what will occur if a program engages in such mixed-mode locking, however.
 453 // Arguably given that the spec legislates the JNI case as undefined our implementation
 454 // could reasonably *avoid* checking owner in fast_unlock().
 455 // In the interest of performance we elide m->Owner==Self check in unlock.
 456 // A perfectly viable alternative is to elide the owner check except when
 457 // Xcheck:jni is enabled.
 458 
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  // Emits the C2 fast path for monitorexit. On exit ZF == 1 means the unlock
  // succeeded; ZF == 0 means the caller must branch to the runtime slow path.
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Note the aliasing: with UseObjectMonitorTable, mark/monitor/top all
  // share register t and are live at disjoint program points.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Only allocate a real out-of-line stub when emitting actual code,
    // not when measuring code size.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive: obj occurs again one slot further down.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark (top, which aliases mark, is dead from here on).
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // rax holds the expected (locked) mark, 'mark' the desired (unlocked) one.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failure: re-push obj onto the lock-stack and take the slow path.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Verify obj does not appear anywhere on this thread's lock-stack;
    // an inflated-locked object must not be stack-locked.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // Without the table, the (tagged) monitor pointer is the mark itself.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Monitor fields are addressed relative to the (possibly tagged) pointer;
    // subtract the tag when the pointer came from the mark word.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the tag so an untagged ObjectMonitor* is stored.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 620 
// Runtime failure handler for verify_int_in_range: aborts the VM with the
// node index, the observed value, and the violated [lo, hi] bounds.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 624 
 625 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 626   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 627   masm->movptr(dst, rsp);
 628   if (framesize > 2 * wordSize) {
 629     masm->addptr(dst, framesize - 2 * wordSize);
 630   }
 631 }
 632 
 633 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
 634   if (PreserveFramePointer) {
 635     // frame pointer is valid
 636 #ifdef ASSERT
 637     // Verify frame pointer value in rbp.
 638     reconstruct_frame_pointer_helper(this, rtmp);
 639     Label L_success;
 640     cmpq(rbp, rtmp);
 641     jccb(Assembler::equal, L_success);
 642     STOP("frame pointer mismatch");
 643     bind(L_success);
 644 #endif // ASSERT
 645   } else {
 646     reconstruct_frame_pointer_helper(this, rbp);
 647   }
 648 }
 649 
 650 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
 651   jint lo = t->_lo;
 652   jint hi = t->_hi;
 653   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
 654   if (t == TypeInt::INT) {
 655     return;
 656   }
 657 
 658   BLOCK_COMMENT("CastII {");
 659   Label fail;
 660   Label succeed;
 661 
 662   if (lo != min_jint) {
 663     cmpl(val, lo);
 664     jccb(Assembler::less, fail);
 665   }
 666   if (hi != max_jint) {
 667     cmpl(val, hi);
 668     jccb(Assembler::greater, fail);
 669   }
 670   jmpb(succeed);
 671 
 672   bind(fail);
 673   movl(c_rarg0, idx);
 674   movl(c_rarg1, val);
 675   movl(c_rarg2, lo);
 676   movl(c_rarg3, hi);
 677   reconstruct_frame_pointer(rscratch1);
 678   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
 679   hlt();
 680   bind(succeed);
 681   BLOCK_COMMENT("} // CastII");
 682 }
 683 
// Runtime failure handler for verify_long_in_range: aborts the VM with the
// node index, the observed value, and the violated [lo, hi] bounds.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 687 
 688 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
 689   jlong lo = t->_lo;
 690   jlong hi = t->_hi;
 691   assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
 692   if (t == TypeLong::LONG) {
 693     return;
 694   }
 695 
 696   BLOCK_COMMENT("CastLL {");
 697   Label fail;
 698   Label succeed;
 699 
 700   auto cmp_val = [&](jlong bound) {
 701     if (is_simm32(bound)) {
 702       cmpq(val, checked_cast<int>(bound));
 703     } else {
 704       mov64(tmp, bound);
 705       cmpq(val, tmp);
 706     }
 707   };
 708 
 709   if (lo != min_jlong) {
 710     cmp_val(lo);
 711     jccb(Assembler::less, fail);
 712   }
 713   if (hi != max_jlong) {
 714     cmp_val(hi);
 715     jccb(Assembler::greater, fail);
 716   }
 717   jmpb(succeed);
 718 
 719   bind(fail);
 720   movl(c_rarg0, idx);
 721   movq(c_rarg1, val);
 722   mov64(c_rarg2, lo);
 723   mov64(c_rarg3, hi);
 724   reconstruct_frame_pointer(rscratch1);
 725   call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
 726   hlt();
 727   bind(succeed);
 728   BLOCK_COMMENT("} // CastLL");
 729 }
 730 
 731 //-------------------------------------------------------------------------------------------
 732 // Generic instructions support for use in .ad files C2 code generation
 733 
 734 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 735   if (dst != src) {
 736     movdqu(dst, src);
 737   }
 738   if (opcode == Op_AbsVD) {
 739     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 740   } else {
 741     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 742     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 743   }
 744 }
 745 
 746 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 747   if (opcode == Op_AbsVD) {
 748     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 749   } else {
 750     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 751     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 752   }
 753 }
 754 
 755 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 756   if (dst != src) {
 757     movdqu(dst, src);
 758   }
 759   if (opcode == Op_AbsVF) {
 760     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 761   } else {
 762     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 763     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 764   }
 765 }
 766 
 767 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 768   if (opcode == Op_AbsVF) {
 769     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 770   } else {
 771     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 772     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 773   }
 774 }
 775 
 776 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 777   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 778   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 779 
 780   if (opcode == Op_MinV) {
 781     if (elem_bt == T_BYTE) {
 782       pminsb(dst, src);
 783     } else if (elem_bt == T_SHORT) {
 784       pminsw(dst, src);
 785     } else if (elem_bt == T_INT) {
 786       pminsd(dst, src);
 787     } else {
 788       assert(elem_bt == T_LONG, "required");
 789       assert(tmp == xmm0, "required");
 790       assert_different_registers(dst, src, tmp);
 791       movdqu(xmm0, dst);
 792       pcmpgtq(xmm0, src);
 793       blendvpd(dst, src);  // xmm0 as mask
 794     }
 795   } else { // opcode == Op_MaxV
 796     if (elem_bt == T_BYTE) {
 797       pmaxsb(dst, src);
 798     } else if (elem_bt == T_SHORT) {
 799       pmaxsw(dst, src);
 800     } else if (elem_bt == T_INT) {
 801       pmaxsd(dst, src);
 802     } else {
 803       assert(elem_bt == T_LONG, "required");
 804       assert(tmp == xmm0, "required");
 805       assert_different_registers(dst, src, tmp);
 806       movdqu(xmm0, src);
 807       pcmpgtq(xmm0, dst);
 808       blendvpd(dst, src);  // xmm0 as mask
 809     }
 810   }
 811 }
 812 
 813 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 814                                   XMMRegister src1, Address src2, int vlen_enc) {
 815   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 816   if (opcode == Op_UMinV) {
 817     switch(elem_bt) {
 818       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 819       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 820       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 821       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 822       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 823     }
 824   } else {
 825     assert(opcode == Op_UMaxV, "required");
 826     switch(elem_bt) {
 827       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 828       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 829       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 830       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 831       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 832     }
 833   }
 834 }
 835 
// Unsigned 64-bit lane-wise min/max (Op_UMinV/Op_UMaxV). On EVEX targets
// without AVX512VL the operation is simply widened to 512 bits; otherwise the
// unsigned compare is emulated by biasing both operands into the signed
// domain (adding 1 << 63) and using the signed quadword compare.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1 (all lanes all-ones)
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63, i.e. the sign-bias constant 0x8000000000000000 per lane
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1 (signed), which is unsigned(SRC2) > unsigned(SRC1)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 866 
 867 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 868                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 869   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 870   if (opcode == Op_UMinV) {
 871     switch(elem_bt) {
 872       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 873       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 874       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 875       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 876       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 877     }
 878   } else {
 879     assert(opcode == Op_UMaxV, "required");
 880     switch(elem_bt) {
 881       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 882       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 883       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 884       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 885       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 886     }
 887   }
 888 }
 889 
// Lane-wise signed min/max for AVX targets. T_LONG needs AVX512 (with VL for
// sub-512-bit vectors) for a direct instruction; otherwise it is emulated
// with a signed compare whose result drives a double-word blend.
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // mask = (src1 > src2); result = mask ? src2 : src1  (the minimum)
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // mask = (src1 > src2); result = mask ? src1 : src2  (the maximum)
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}
 931 
 932 // Float/Double min max
 933 
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the blend/min-max/compare emitters and the register whose sign
  // bits form the blend mask, for the four {float,double} x {min,max} cases.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // On E-core-optimized builds, materialize the blend mask from the sign bits
  // up front (into tmp) instead of letting the blends use the sign implicitly.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Arithmetic shift by >= 32 replicates each dword lane's sign bit.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // tmp = (0 > mask) per qword lane: all-ones where the sign bit is set.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  // Bias operands as per the pseudo code above, take hardware min/max, then
  // patch the result wherever atmp compared unordered with itself (NaN).
  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
1021 
// AVX512 variant of the Java-semantics float/double min/max above: an opmask
// register built from the sign bits performs the operand swap that handles
// -0.0/+0.0, and a self-compare (UNORD_Q) drives the NaN fix-up merge.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    // ktmp = sign bits of 'a'; swap operands where 'a' is negative.
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    // Where atmp is NaN (unordered with itself), overwrite dst with atmp.
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    // ktmp = sign bits of 'b'; swap operands where 'b' is negative.
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1068 
1069 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1070                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1071   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1072          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1073 
1074   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1075                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1076   if (elem_bt == T_FLOAT) {
1077     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1078   } else {
1079     assert(elem_bt == T_DOUBLE, "");
1080     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1081   }
1082 }
1083 
1084 // Float/Double signum
// Float/Double signum: if dst is +/-0.0 or NaN it is left unchanged;
// otherwise dst becomes +1.0 for positive inputs or -1.0 for negative ones.
// 'zero' holds +0.0; 'one' is expected to hold +1.0 (the -1.0 case is
// produced by flipping its sign bit below).
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load +1.0. movflt does not modify EFLAGS, so the jcc below still tests
    // the compare against zero performed above (above == input was positive).
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Negative input: flip the sign bit of +1.0 to yield -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load +1.0; flags from the compare above are still live (see note above).
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Negative input: flip the sign bit of +1.0 to yield -1.0.
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1119 
1120 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1121   if (sign) {
1122     pmovsxbw(dst, src);
1123   } else {
1124     pmovzxbw(dst, src);
1125   }
1126 }
1127 
1128 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1129   if (sign) {
1130     vpmovsxbw(dst, src, vector_len);
1131   } else {
1132     vpmovzxbw(dst, src, vector_len);
1133   }
1134 }
1135 
1136 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1137   if (sign) {
1138     vpmovsxbd(dst, src, vector_len);
1139   } else {
1140     vpmovzxbd(dst, src, vector_len);
1141   }
1142 }
1143 
1144 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1145   if (sign) {
1146     vpmovsxwd(dst, src, vector_len);
1147   } else {
1148     vpmovzxwd(dst, src, vector_len);
1149   }
1150 }
1151 
1152 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1153                                      int shift, int vector_len) {
1154   if (opcode == Op_RotateLeftV) {
1155     if (etype == T_INT) {
1156       evprold(dst, src, shift, vector_len);
1157     } else {
1158       assert(etype == T_LONG, "expected type T_LONG");
1159       evprolq(dst, src, shift, vector_len);
1160     }
1161   } else {
1162     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1163     if (etype == T_INT) {
1164       evprord(dst, src, shift, vector_len);
1165     } else {
1166       assert(etype == T_LONG, "expected type T_LONG");
1167       evprorq(dst, src, shift, vector_len);
1168     }
1169   }
1170 }
1171 
1172 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1173                                      XMMRegister shift, int vector_len) {
1174   if (opcode == Op_RotateLeftV) {
1175     if (etype == T_INT) {
1176       evprolvd(dst, src, shift, vector_len);
1177     } else {
1178       assert(etype == T_LONG, "expected type T_LONG");
1179       evprolvq(dst, src, shift, vector_len);
1180     }
1181   } else {
1182     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1183     if (etype == T_INT) {
1184       evprorvd(dst, src, shift, vector_len);
1185     } else {
1186       assert(etype == T_LONG, "expected type T_LONG");
1187       evprorvq(dst, src, shift, vector_len);
1188     }
1189   }
1190 }
1191 
1192 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1193   if (opcode == Op_RShiftVI) {
1194     psrad(dst, shift);
1195   } else if (opcode == Op_LShiftVI) {
1196     pslld(dst, shift);
1197   } else {
1198     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1199     psrld(dst, shift);
1200   }
1201 }
1202 
1203 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1204   switch (opcode) {
1205     case Op_RShiftVI:  psrad(dst, shift); break;
1206     case Op_LShiftVI:  pslld(dst, shift); break;
1207     case Op_URShiftVI: psrld(dst, shift); break;
1208 
1209     default: assert(false, "%s", NodeClassNames[opcode]);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1214   if (opcode == Op_RShiftVI) {
1215     vpsrad(dst, nds, shift, vector_len);
1216   } else if (opcode == Op_LShiftVI) {
1217     vpslld(dst, nds, shift, vector_len);
1218   } else {
1219     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1220     vpsrld(dst, nds, shift, vector_len);
1221   }
1222 }
1223 
1224 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1225   switch (opcode) {
1226     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1227     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1228     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1229 
1230     default: assert(false, "%s", NodeClassNames[opcode]);
1231   }
1232 }
1233 
1234 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1235   switch (opcode) {
1236     case Op_RShiftVB:  // fall-through
1237     case Op_RShiftVS:  psraw(dst, shift); break;
1238 
1239     case Op_LShiftVB:  // fall-through
1240     case Op_LShiftVS:  psllw(dst, shift);   break;
1241 
1242     case Op_URShiftVS: // fall-through
1243     case Op_URShiftVB: psrlw(dst, shift);  break;
1244 
1245     default: assert(false, "%s", NodeClassNames[opcode]);
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1250   switch (opcode) {
1251     case Op_RShiftVB:  // fall-through
1252     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1253 
1254     case Op_LShiftVB:  // fall-through
1255     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1256 
1257     case Op_URShiftVS: // fall-through
1258     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1259 
1260     default: assert(false, "%s", NodeClassNames[opcode]);
1261   }
1262 }
1263 
1264 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1265   switch (opcode) {
1266     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1267     case Op_LShiftVL:  psllq(dst, shift); break;
1268     case Op_URShiftVL: psrlq(dst, shift); break;
1269 
1270     default: assert(false, "%s", NodeClassNames[opcode]);
1271   }
1272 }
1273 
1274 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1275   if (opcode == Op_RShiftVL) {
1276     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1277   } else if (opcode == Op_LShiftVL) {
1278     psllq(dst, shift);
1279   } else {
1280     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1281     psrlq(dst, shift);
1282   }
1283 }
1284 
1285 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1286   switch (opcode) {
1287     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1288     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1289     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1290 
1291     default: assert(false, "%s", NodeClassNames[opcode]);
1292   }
1293 }
1294 
1295 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1296   if (opcode == Op_RShiftVL) {
1297     evpsraq(dst, nds, shift, vector_len);
1298   } else if (opcode == Op_LShiftVL) {
1299     vpsllq(dst, nds, shift, vector_len);
1300   } else {
1301     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1302     vpsrlq(dst, nds, shift, vector_len);
1303   }
1304 }
1305 
1306 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1307   switch (opcode) {
1308     case Op_RShiftVB:  // fall-through
1309     case Op_RShiftVS:  // fall-through
1310     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1311 
1312     case Op_LShiftVB:  // fall-through
1313     case Op_LShiftVS:  // fall-through
1314     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1315 
1316     case Op_URShiftVB: // fall-through
1317     case Op_URShiftVS: // fall-through
1318     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1319 
1320     default: assert(false, "%s", NodeClassNames[opcode]);
1321   }
1322 }
1323 
1324 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1325   switch (opcode) {
1326     case Op_RShiftVB:  // fall-through
1327     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1328 
1329     case Op_LShiftVB:  // fall-through
1330     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1331 
1332     case Op_URShiftVB: // fall-through
1333     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1334 
1335     default: assert(false, "%s", NodeClassNames[opcode]);
1336   }
1337 }
1338 
// Per-lane variable-count 64-bit shifts. 'tmp' is only used on the AVX2
// arithmetic-right-shift path; all other paths require it to be xnoreg.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        // evpsravq requires AVX512VL for sub-512-bit vectors; widen otherwise.
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no variable arithmetic 64-bit shift; emulate using
        // sra(x, s) == ((x >>> s) ^ m) - m, where m = sign_mask >>> s.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1371 
// Variable shift src by shift using vtmp as a TEMP, giving word result in dst:
// byte lanes are widened to dwords, shifted per-lane, masked back into byte
// range, and packed down into word lanes.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  // Widen bytes to dwords, sign- or zero-extended to match the shift kind.
  vextendbd(sign, dst, src, 1);
  // Shift counts are always treated as unsigned bytes.
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  // Strip bits shifted beyond the byte range before repacking.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1386 
// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst:
// byte lanes are widened to word lanes (EVEX path), shifted per-lane, masked,
// and packed back down to bytes.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Word lanes need the next-larger vector width than the byte vector.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  // Shift counts are always treated as unsigned bytes.
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Clear the high byte of each word before packing back down.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // vpackuswb packs within 128-bit lanes; restore overall element order.
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1407 
1408 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1409   switch(typ) {
1410     case T_BYTE:
1411       pinsrb(dst, val, idx);
1412       break;
1413     case T_SHORT:
1414       pinsrw(dst, val, idx);
1415       break;
1416     case T_INT:
1417       pinsrd(dst, val, idx);
1418       break;
1419     case T_LONG:
1420       pinsrq(dst, val, idx);
1421       break;
1422     default:
1423       assert(false,"Should not reach here.");
1424       break;
1425   }
1426 }
1427 
1428 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1429   switch(typ) {
1430     case T_BYTE:
1431       vpinsrb(dst, src, val, idx);
1432       break;
1433     case T_SHORT:
1434       vpinsrw(dst, src, val, idx);
1435       break;
1436     case T_INT:
1437       vpinsrd(dst, src, val, idx);
1438       break;
1439     case T_LONG:
1440       vpinsrq(dst, src, val, idx);
1441       break;
1442     default:
1443       assert(false,"Should not reach here.");
1444       break;
1445   }
1446 }
1447 
// Masked gather of one 64-bit slice: 4 shorts or 8 bytes, each loaded through
// a 32-bit index from idx_base, gated by bit (mask_idx + i) of the GPR
// 'mask'. Skipped lanes remain zero; mask_idx is incremented for every lane,
// loaded or not.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      // btq copies the tested mask bit into CF; skip the load when clear.
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1478 
1479 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1480                                   Register base, Register idx_base,
1481                                   Register rtmp, int vlen_enc) {
1482   vpxor(dst, dst, dst, vlen_enc);
1483   if (elem_bt == T_SHORT) {
1484     for (int i = 0; i < 4; i++) {
1485       // dst[i] = src[idx_base[i]]
1486       movl(rtmp, Address(idx_base, i * 4));
1487       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1488     }
1489   } else {
1490     assert(elem_bt == T_BYTE, "");
1491     for (int i = 0; i < 8; i++) {
1492       // dst[i] = src[idx_base[i]]
1493       movl(rtmp, Address(idx_base, i * 4));
1494       pinsrb(dst, Address(base, rtmp), i);
1495     }
1496   }
1497 }
1498 
1499 /*
1500  * Gather using hybrid algorithm, first partially unroll scalar loop
1501  * to accumulate values from gather indices into a quad-word(64bit) slice.
1502  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1503  * permutation to place the slice into appropriate vector lane
1504  * locations in destination vector. Following pseudo code describes the
1505  * algorithm in detail:
1506  *
1507  * DST_VEC = ZERO_VEC
1508  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1509  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1510  * FOREACH_ITER:
1511  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1512  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1513  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1514  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1515  *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
1518  *
1519  */
// See the block comment above for the algorithm. 'mask' may be noreg for an
// unmasked gather. 'length' counts remaining elements; it is assumed to be a
// multiple of the per-iteration step (the loop exits on exact zero) — TODO
// confirm against callers.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build TWO_VEC: (0 - (-1)) << 1 == 2 in every dword lane.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the consumed 32-bit indices: 8 for bytes, 4 for shorts.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1553 
1554 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1555   switch(typ) {
1556     case T_INT:
1557       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1558       break;
1559     case T_FLOAT:
1560       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1561       break;
1562     case T_LONG:
1563       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1564       break;
1565     case T_DOUBLE:
1566       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1567       break;
1568     default:
1569       assert(false,"Should not reach here.");
1570       break;
1571   }
1572 }
1573 
1574 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1575   switch(typ) {
1576     case T_INT:
1577       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1578       break;
1579     case T_FLOAT:
1580       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1581       break;
1582     case T_LONG:
1583       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1584       break;
1585     case T_DOUBLE:
1586       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1587       break;
1588     default:
1589       assert(false,"Should not reach here.");
1590       break;
1591   }
1592 }
1593 
1594 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1595   switch(typ) {
1596     case T_INT:
1597       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1598       break;
1599     case T_FLOAT:
1600       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1601       break;
1602     case T_LONG:
1603       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1604       break;
1605     case T_DOUBLE:
1606       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1607       break;
1608     default:
1609       assert(false,"Should not reach here.");
1610       break;
1611   }
1612 }
1613 
1614 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1615   if (vlen_in_bytes <= 16) {
1616     pxor (dst, dst);
1617     psubb(dst, src);
1618     switch (elem_bt) {
1619       case T_BYTE:   /* nothing to do */ break;
1620       case T_SHORT:  pmovsxbw(dst, dst); break;
1621       case T_INT:    pmovsxbd(dst, dst); break;
1622       case T_FLOAT:  pmovsxbd(dst, dst); break;
1623       case T_LONG:   pmovsxbq(dst, dst); break;
1624       case T_DOUBLE: pmovsxbq(dst, dst); break;
1625 
1626       default: assert(false, "%s", type2name(elem_bt));
1627     }
1628   } else {
1629     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1630     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1631 
1632     vpxor (dst, dst, dst, vlen_enc);
1633     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1634 
1635     switch (elem_bt) {
1636       case T_BYTE:   /* nothing to do */            break;
1637       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1638       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1639       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1640       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1641       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1642 
1643       default: assert(false, "%s", type2name(elem_bt));
1644     }
1645   }
1646 }
1647 
1648 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1649   if (novlbwdq) {
1650     vpmovsxbd(xtmp, src, vlen_enc);
1651     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1652             Assembler::eq, true, vlen_enc, noreg);
1653   } else {
1654     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1655     vpsubb(xtmp, xtmp, src, vlen_enc);
1656     evpmovb2m(dst, xtmp, vlen_enc);
1657   }
1658 }
1659 
1660 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1661   if (is_integral_type(bt)) {
1662     switch (vlen_in_bytes) {
1663       case 4:  movdl(dst, src);   break;
1664       case 8:  movq(dst, src);    break;
1665       case 16: movdqu(dst, src);  break;
1666       case 32: vmovdqu(dst, src); break;
1667       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1668       default: ShouldNotReachHere();
1669     }
1670   } else {
1671     switch (vlen_in_bytes) {
1672       case 4:  movflt(dst, src); break;
1673       case 8:  movdbl(dst, src); break;
1674       case 16: movups(dst, src); break;
1675       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1676       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1677       default: ShouldNotReachHere();
1678     }
1679   }
1680 }
1681 
1682 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1683   assert(rscratch != noreg || always_reachable(src), "missing");
1684 
1685   if (reachable(src)) {
1686     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1687   } else {
1688     lea(rscratch, src);
1689     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1690   }
1691 }
1692 
1693 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1694   int vlen_enc = vector_length_encoding(vlen);
1695   if (VM_Version::supports_avx()) {
1696     if (bt == T_LONG) {
1697       if (VM_Version::supports_avx2()) {
1698         vpbroadcastq(dst, src, vlen_enc);
1699       } else {
1700         vmovddup(dst, src, vlen_enc);
1701       }
1702     } else if (bt == T_DOUBLE) {
1703       if (vlen_enc != Assembler::AVX_128bit) {
1704         vbroadcastsd(dst, src, vlen_enc, noreg);
1705       } else {
1706         vmovddup(dst, src, vlen_enc);
1707       }
1708     } else {
1709       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1710         vpbroadcastd(dst, src, vlen_enc);
1711       } else {
1712         vbroadcastss(dst, src, vlen_enc);
1713       }
1714     }
1715   } else if (VM_Version::supports_sse3()) {
1716     movddup(dst, src);
1717   } else {
1718     load_vector(bt, dst, src, vlen);
1719   }
1720 }
1721 
1722 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1723   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1724   int offset = exact_log2(type2aelembytes(bt)) << 6;
1725   if (is_floating_point_type(bt)) {
1726     offset += 128;
1727   }
1728   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1729   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1730 }
1731 
1732 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1733 
1734 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1735   int vector_len = Assembler::AVX_128bit;
1736 
1737   switch (opcode) {
1738     case Op_AndReductionV:  pand(dst, src); break;
1739     case Op_OrReductionV:   por (dst, src); break;
1740     case Op_XorReductionV:  pxor(dst, src); break;
1741     case Op_MinReductionV:
1742       switch (typ) {
1743         case T_BYTE:        pminsb(dst, src); break;
1744         case T_SHORT:       pminsw(dst, src); break;
1745         case T_INT:         pminsd(dst, src); break;
1746         case T_LONG:        assert(UseAVX > 2, "required");
1747                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1748         default:            assert(false, "wrong type");
1749       }
1750       break;
1751     case Op_MaxReductionV:
1752       switch (typ) {
1753         case T_BYTE:        pmaxsb(dst, src); break;
1754         case T_SHORT:       pmaxsw(dst, src); break;
1755         case T_INT:         pmaxsd(dst, src); break;
1756         case T_LONG:        assert(UseAVX > 2, "required");
1757                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1758         default:            assert(false, "wrong type");
1759       }
1760       break;
1761     case Op_AddReductionVF: addss(dst, src); break;
1762     case Op_AddReductionVD: addsd(dst, src); break;
1763     case Op_AddReductionVI:
1764       switch (typ) {
1765         case T_BYTE:        paddb(dst, src); break;
1766         case T_SHORT:       paddw(dst, src); break;
1767         case T_INT:         paddd(dst, src); break;
1768         default:            assert(false, "wrong type");
1769       }
1770       break;
1771     case Op_AddReductionVL: paddq(dst, src); break;
1772     case Op_MulReductionVF: mulss(dst, src); break;
1773     case Op_MulReductionVD: mulsd(dst, src); break;
1774     case Op_MulReductionVI:
1775       switch (typ) {
1776         case T_SHORT:       pmullw(dst, src); break;
1777         case T_INT:         pmulld(dst, src); break;
1778         default:            assert(false, "wrong type");
1779       }
1780       break;
1781     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1782                             evpmullq(dst, dst, src, vector_len); break;
1783     default:                assert(false, "wrong opcode");
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1788   switch (opcode) {
1789     case Op_AddReductionVF: addps(dst, src); break;
1790     case Op_AddReductionVD: addpd(dst, src); break;
1791     case Op_MulReductionVF: mulps(dst, src); break;
1792     case Op_MulReductionVD: mulpd(dst, src); break;
1793     default:                assert(false, "%s", NodeClassNames[opcode]);
1794   }
1795 }
1796 
// 256-bit combining step used by the reduction sequences below:
// dst = src1 <opcode> src2, element type typ.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        // Quadword min/max have no AVX2 form; they require AVX-512.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1844 
1845 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1846   int vector_len = Assembler::AVX_256bit;
1847 
1848   switch (opcode) {
1849     case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1850     case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1851     case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1852     case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1853     default:                assert(false, "%s", NodeClassNames[opcode]);
1854   }
1855 }
1856 
1857 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1858                                   XMMRegister dst, XMMRegister src,
1859                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1860   switch (opcode) {
1861     case Op_AddReductionVF:
1862     case Op_MulReductionVF:
1863       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1864       break;
1865 
1866     case Op_AddReductionVD:
1867     case Op_MulReductionVD:
1868       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1869       break;
1870 
1871     default: assert(false, "wrong opcode");
1872   }
1873 }
1874 
1875 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1876                                             XMMRegister dst, XMMRegister src,
1877                                             XMMRegister vtmp1, XMMRegister vtmp2) {
1878   switch (opcode) {
1879     case Op_AddReductionVF:
1880     case Op_MulReductionVF:
1881       unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1882       break;
1883 
1884     case Op_AddReductionVD:
1885     case Op_MulReductionVD:
1886       unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1887       break;
1888 
1889     default: assert(false, "%s", NodeClassNames[opcode]);
1890   }
1891 }
1892 
1893 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1894                              Register dst, Register src1, XMMRegister src2,
1895                              XMMRegister vtmp1, XMMRegister vtmp2) {
1896   switch (vlen) {
1897     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1900     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901 
1902     default: assert(false, "wrong vector length");
1903   }
1904 }
1905 
1906 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1907                              Register dst, Register src1, XMMRegister src2,
1908                              XMMRegister vtmp1, XMMRegister vtmp2) {
1909   switch (vlen) {
1910     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1912     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1913     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914 
1915     default: assert(false, "wrong vector length");
1916   }
1917 }
1918 
1919 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1920                              Register dst, Register src1, XMMRegister src2,
1921                              XMMRegister vtmp1, XMMRegister vtmp2) {
1922   switch (vlen) {
1923     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1924     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1926     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927 
1928     default: assert(false, "wrong vector length");
1929   }
1930 }
1931 
1932 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1933                              Register dst, Register src1, XMMRegister src2,
1934                              XMMRegister vtmp1, XMMRegister vtmp2) {
1935   switch (vlen) {
1936     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1937     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1938     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1939     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1940 
1941     default: assert(false, "wrong vector length");
1942   }
1943 }
1944 
1945 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1946                              Register dst, Register src1, XMMRegister src2,
1947                              XMMRegister vtmp1, XMMRegister vtmp2) {
1948   switch (vlen) {
1949     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1950     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952 
1953     default: assert(false, "wrong vector length");
1954   }
1955 }
1956 
1957 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1958   switch (vlen) {
1959     case 2:
1960       assert(vtmp2 == xnoreg, "");
1961       reduce2F(opcode, dst, src, vtmp1);
1962       break;
1963     case 4:
1964       assert(vtmp2 == xnoreg, "");
1965       reduce4F(opcode, dst, src, vtmp1);
1966       break;
1967     case 8:
1968       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1969       break;
1970     case 16:
1971       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1972       break;
1973     default: assert(false, "wrong vector length");
1974   }
1975 }
1976 
1977 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1978   switch (vlen) {
1979     case 2:
1980       assert(vtmp2 == xnoreg, "");
1981       reduce2D(opcode, dst, src, vtmp1);
1982       break;
1983     case 4:
1984       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1985       break;
1986     case 8:
1987       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1988       break;
1989     default: assert(false, "wrong vector length");
1990   }
1991 }
1992 
1993 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1994   switch (vlen) {
1995     case 2:
1996       assert(vtmp1 == xnoreg, "");
1997       assert(vtmp2 == xnoreg, "");
1998       unorderedReduce2F(opcode, dst, src);
1999       break;
2000     case 4:
2001       assert(vtmp2 == xnoreg, "");
2002       unorderedReduce4F(opcode, dst, src, vtmp1);
2003       break;
2004     case 8:
2005       unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2006       break;
2007     case 16:
2008       unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2009       break;
2010     default: assert(false, "wrong vector length");
2011   }
2012 }
2013 
2014 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2015   switch (vlen) {
2016     case 2:
2017       assert(vtmp1 == xnoreg, "");
2018       assert(vtmp2 == xnoreg, "");
2019       unorderedReduce2D(opcode, dst, src);
2020       break;
2021     case 4:
2022       assert(vtmp2 == xnoreg, "");
2023       unorderedReduce4D(opcode, dst, src, vtmp1);
2024       break;
2025     case 8:
2026       unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2027       break;
2028     default: assert(false, "wrong vector length");
2029   }
2030 }
2031 
// Reduce the two int lanes of src2 and combine with the scalar in src1;
// result goes to dst.  vtmp1/vtmp2 are clobbered.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Horizontal add folds adjacent dword pairs in one instruction.
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring lane 1 down to lane 0 and combine.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar accumulator from src1.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2046 
// Reduce four int lanes: fold the upper pair into the lower pair, then
// finish with reduce2I (which also folds in scalar src1).
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // 0xE moves dwords 2,3 into positions 0,1.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2060 
// Reduce eight int lanes (256-bit): fold the high 128-bit half into the low
// half, then continue with the narrower helpers.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2073 
// Reduce sixteen int lanes (512-bit): fold the high 256 bits into the low
// 256, then reduce eight lanes.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2079 
// Reduce the low eight byte lanes of src2 and combine with scalar src1 into
// dst via a shift-and-combine halving tree (4+4, 2+2, 1+1).
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Fold bytes 4..7 onto bytes 0..3.
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  // Fold bytes 2..3 onto bytes 0..1.
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Fold byte 1 onto byte 0.
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Widen to int and fold in the scalar accumulator.
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst); // sign-extend the byte result to int
}
2095 
// Reduce sixteen byte lanes: fold the high eight bytes onto the low eight,
// then finish with reduce8B.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2101 
// Reduce 32 byte lanes (256-bit): fold the high 128-bit half into the low
// half, then reduce sixteen bytes.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2107 
// Reduce 64 byte lanes (512-bit): fold the high 256 bits into the low 256,
// then reduce 32 bytes.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2113 
// Multiply-reduce eight bytes: x86 has no byte multiply, so sign-extend to
// shorts and reduce in the short domain.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2118 
2119 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2120   if (UseAVX > 1) {
2121     int vector_len = Assembler::AVX_256bit;
2122     vpmovsxbw(vtmp1, src2, vector_len);
2123     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2124   } else {
2125     pmovsxbw(vtmp2, src2);
2126     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2127     pshufd(vtmp2, src2, 0x1);
2128     pmovsxbw(vtmp2, src2);
2129     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2130   }
2131 }
2132 
// Multiply-reduce 32 bytes.  With AVX512BW the whole vector widens to 32
// shorts in one shot; otherwise reduce each 128-bit half via mulreduce16B,
// threading the scalar result through dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2145 
// Multiply-reduce 64 bytes: reduce each 256-bit half with mulreduce32B,
// threading the scalar result through dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2151 
// Reduce the low four short lanes of src2 and combine with scalar src1 into
// dst.  vtmp1/vtmp2 are clobbered.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds fold four words down to one.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Halving tree: fold words 2,3 onto 0,1, then word 1 onto word 0.
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  // Widen to int and fold in the scalar accumulator.
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst); // sign-extend the short result to int
}
2172 
// Reduce eight short lanes: fold the high four onto the low four, then
// finish with reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    // 0xE moves dwords 2,3 (words 4..7) into positions 0,1.
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2185 
// Reduce sixteen short lanes (256-bit) down to eight, then finish with
// reduce8S.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    // vphaddw works per 128-bit lane; 0xD8 gathers the sums into the low half.
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2197 
2198 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2199   int vector_len = Assembler::AVX_256bit;
2200   vextracti64x4_high(vtmp1, src2);
2201   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2202   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2203 }
2204 
// Reduce the two long lanes of src2 and combine with the scalar in src1.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Bring lane 1 down to lane 0 and combine with lane 0.
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  // Fold in the scalar accumulator from src1.
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2212 
// Reduce four long lanes (256-bit): fold the high 128-bit half into the low
// half, then reduce two lanes.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2218 
// Reduce eight long lanes (512-bit): fold the high 256 bits into the low
// 256, then reduce four lanes.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2224 
// Build an opmask with the low 'len' bits set, i.e. dst = (1 << len) - 1.
// temp is clobbered.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len); // zero all bits at positions >= len
  kmovql(dst, temp);
}
2230 
// Ordered reduction over two float lanes: dst enters holding the running
// scalar value and is combined with lane 0, then lane 1, of src.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1); // bring lane 1 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2236 
// Ordered reduction over four float lanes: reduce lanes 0-1, then fold in
// lanes 2 and 3 one at a time (order matters for FP).
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2); // lane 2 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3); // lane 3 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2244 
// Ordered reduction over eight float lanes: low 128-bit half first, then
// the high half.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2250 
// Ordered reduction over sixteen float lanes: low 256-bit half first, then
// the high half.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2256 
// Unordered reduction of two float lanes: dst = src[1] <op> src[0]
// (no incoming accumulator at this level).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1); // lane 1 into dst lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2261 
// Unordered reduction of four float lanes as a pairwise tree: combine the
// upper pair with the lower pair packed, then the remaining two lanes.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE); // lanes 2,3 into positions 0,1
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2267 
// Unordered reduction of eight float lanes: fold the high 128-bit half onto
// the low half packed, then reduce four lanes.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2273 
// Unordered reduction of sixteen float lanes: fold the high 256 bits onto
// the low 256 packed, then reduce eight lanes.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2279 
// Ordered reduction over two double lanes: dst enters holding the running
// scalar value and is combined with lane 0, then lane 1, of src.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE); // bring lane 1 down to lane 0
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2285 
// Ordered reduction over four double lanes: low 128-bit half first, then
// the high half.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2291 
// Ordered reduction over eight double lanes: low 256-bit half first, then
// the high half.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2297 
// Unordered reduction of two double lanes: dst = src[1] <op> src[0]
// (no incoming accumulator at this level).
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE); // lane 1 into dst lane 0
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2302 
// Unordered reduction of four double lanes: fold the high 128-bit half onto
// the low half packed, then reduce two lanes.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2308 
// Unordered reduction of 8 doubles (512-bit vector) in src into dst.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);                                       // vtmp2 = upper 4 doubles of src
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);  // fold upper half into lower half
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);                         // finish with a 4-double reduction
}
2314 
// Masked AVX-512 vector load: dst = [src] under kmask ('merge' selects
// merge- vs zero-masking). Thin forwarder to the MacroAssembler helper.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2318 
// Masked AVX-512 vector store: [dst] = src under kmask. Thin forwarder to
// the MacroAssembler helper.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2322 
// Masked AVX-512 register-to-register vector move: dst = src under kmask.
// Thin forwarder to the MacroAssembler helper.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2326 
2327 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2328                                  int vec_enc) {
2329   switch(elem_bt) {
2330     case T_INT:
2331     case T_FLOAT:
2332       vmaskmovps(dst, src, mask, vec_enc);
2333       break;
2334     case T_LONG:
2335     case T_DOUBLE:
2336       vmaskmovpd(dst, src, mask, vec_enc);
2337       break;
2338     default:
2339       fatal("Unsupported type %s", type2name(elem_bt));
2340       break;
2341   }
2342 }
2343 
2344 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2345                                  int vec_enc) {
2346   switch(elem_bt) {
2347     case T_INT:
2348     case T_FLOAT:
2349       vmaskmovps(dst, src, mask, vec_enc);
2350       break;
2351     case T_LONG:
2352     case T_DOUBLE:
2353       vmaskmovpd(dst, src, mask, vec_enc);
2354       break;
2355     default:
2356       fatal("Unsupported type %s", type2name(elem_bt));
2357       break;
2358   }
2359 }
2360 
// Min/Max reduction over 'vlen' floats in src, result in dst.
// Each loop iteration halves the number of live elements:
//   i == 3: fold upper 256 bits into lower (16 -> 8 elements)
//   i == 2: fold upper 128 bits into lower ( 8 -> 4 elements)
//   i == 1: in-lane permute 14 (0xE) moves elements 2,3 down (4 -> 2)
//   i == 0: in-lane permute 1 swaps elements 0,1               (2 -> 1)
// 'is_dst_valid' means dst already holds a value that must be folded into
// the final result. tmp/atmp/btmp are scratch registers for the
// pre-AVX10.2 vminmax_fp sequence; with AVX10.2 the masked (k0) form is
// used and the scratch registers are not needed.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  const int permconst[] = {1, 14};  // vpermilps immediates for the 2- and 4-element steps
  XMMRegister wsrc = src;   // working source (rotates to the previous partial result)
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;  // first fold of a 512-bit vector combines two 256-bit halves
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;  // last step can write straight to dst when there is no initial value to fold in
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;  // partial result becomes next iteration's source
    vlen_enc = Assembler::AVX_128bit;  // all subsequent combines fit in 128 bits
  }
  // Fold the pre-existing dst value into the reduced result.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2403 
// Min/Max reduction over 'vlen' doubles in src, result in dst.
// Mirrors reduceFloatMinMax: each loop iteration halves the live elements:
//   i == 2: fold upper 256 bits into lower (8 -> 4 elements)
//   i == 1: fold upper 128 bits into lower (4 -> 2 elements)
//   i == 0: in-lane permute swaps the two doubles (2 -> 1)
// 'is_dst_valid' means dst already holds a value that must be folded into
// the final result. tmp/atmp/btmp are scratch registers for the
// pre-AVX10.2 vminmax_fp sequence; with AVX10.2 the masked (k0) form is
// used and the scratch registers are not needed.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;   // working source (rotates to the previous partial result)
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;  // first fold of a 512-bit vector combines two 256-bit halves
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;  // last step can write straight to dst when there is no initial value to fold in
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);  // swap the remaining pair of doubles
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;  // partial result becomes next iteration's source
    vlen_enc = Assembler::AVX_128bit;  // all subsequent combines fit in 128 bits
  }

  // Fold the pre-existing dst value into the reduced result.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2445 
2446 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2447   switch (bt) {
2448     case T_BYTE:  pextrb(dst, src, idx); break;
2449     case T_SHORT: pextrw(dst, src, idx); break;
2450     case T_INT:   pextrd(dst, src, idx); break;
2451     case T_LONG:  pextrq(dst, src, idx); break;
2452 
2453     default:
2454       assert(false,"Should not reach here.");
2455       break;
2456   }
2457 }
2458 
2459 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2460   int esize =  type2aelembytes(typ);
2461   int elem_per_lane = 16/esize;
2462   int lane = elemindex / elem_per_lane;
2463   int eindex = elemindex % elem_per_lane;
2464 
2465   if (lane >= 2) {
2466     assert(UseAVX > 2, "required");
2467     vextractf32x4(dst, src, lane & 3);
2468     return dst;
2469   } else if (lane > 0) {
2470     assert(UseAVX > 0, "required");
2471     vextractf128(dst, src, lane);
2472     return dst;
2473   } else {
2474     return src;
2475   }
2476 }
2477 
2478 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2479   if (typ == T_BYTE) {
2480     movsbl(dst, dst);
2481   } else if (typ == T_SHORT) {
2482     movswl(dst, dst);
2483   }
2484 }
2485 
// Extract integral element 'elemindex' of vector 'src' into GPR 'dst',
// sign-extending sub-int elements to 32 bits.
// NOTE(review): the index is taken modulo the elements-per-lane count, so
// this presumably expects 'src' to already hold the correct 128-bit lane
// (e.g. selected via get_lane) — confirm against callers.
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;  // index within the 128-bit lane
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0: a plain move is cheaper than pextr*.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);  // sign-extend byte/short values
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2504 
// Extract FP element 'elemindex' of vector 'src' into the low lane of 'dst',
// with the bits above the element zeroed.
// NOTE(review): as with the GPR variant, the index is taken modulo the
// elements-per-lane count, so 'src' presumably already holds the correct
// 128-bit lane — confirm against callers.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;  // index within the 128-bit lane
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    movq(dst, src);  // element 0: movq also zeroes the upper 64 bits
  } else {
    if (typ == T_FLOAT) {
      // Rotate the wanted float into element 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Shift the wanted double down to element 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);  // zero bits above the double
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2542 
// AVX-512 per-lane compare: kdmask = (src1 'comparison' src2), evaluated
// under source mask ksmask. Dispatches on element size; note that
// T_FLOAT/T_DOUBLE use the integer compare of matching lane width, i.e.
// the raw bit patterns of the lanes are compared.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
  switch(typ) {
    case T_BYTE:
    case T_BOOLEAN:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_SHORT:
    case T_CHAR:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
2566 
// AVX-512 per-lane compare against a memory operand: kdmask =
// (src1 'comparison' [src2]) under source mask ksmask. 'rscratch' is used
// to materialize src2's address when it is not directly reachable.
// As in the register variant, T_FLOAT/T_DOUBLE compare raw lane bits with
// the matching-width integer compare.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src2), "missing");

  switch(typ) {
    case T_BOOLEAN:
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
2592 
// AVX-512 masked blend: per lane, dst = kmask ? src2 : src1 (with 'merge'
// controlling merge- vs zero-masking). Dispatches on element width;
// T_FLOAT/T_DOUBLE reuse the integer blends of the same lane size, which is
// a pure bit move and therefore equivalent.
void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  switch(typ) {
    case T_BYTE:
      evpblendmb(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_SHORT:
      evpblendmw(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpblendmd(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpblendmq(dst, kmask, src1, src2, merge, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
2614 
// Emit a vector test of src1 against src2 (vtestps/vptest/ptest), setting
// the condition flags for a following branch. For vectors shorter than 16
// bytes the low part of src1 is duplicated into vtmp so the undefined upper
// bytes of the register do not influence the flags.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 replicates element 0 (4-byte vector); 0x04 replicates the low 8 bytes.
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1;  // full 16-byte vector: test src1 directly
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2643 
// Element-wise vector add: dst = src1 + src2. Integer lane widths use
// vpadd*, FP lane widths use vadd*.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
#ifdef ASSERT
  // Without AVX512BW, byte/short vector ops are limited to 256-bit vectors
  // and to the VEX-encodable registers XMM0-15.
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_bw_supported = VM_Version::supports_avx512bw();
  if (is_bw && !is_bw_supported) {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
           "XMM register should be 0-15");
  }
#endif // ASSERT
  switch (elem_bt) {
    case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
    case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
    case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
    case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
    case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
    case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}
2664 
// Broadcast the scalar in GPR 'src' into every lane of vector 'dst'.
// When the needed AVX-512 features are present (BW for byte/short, VL for
// sub-512-bit vectors), uses the EVEX GPR-source broadcast directly;
// otherwise moves the value into 'dst' first and broadcasts with the AVX2
// register-source forms, which are limited to XMM0-15.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX path: broadcast straight from the GPR.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 fallback: move GPR value into dst, then broadcast register-to-register.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2693 
// Convert byte vector elements in 'src' to the wider element type
// 'to_elem_bt' in 'dst': sign-extend for integral targets, and additionally
// convert int->FP for float/double targets.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);  // byte -> int, then int -> float
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      // The int intermediate holds the same element count at half the final
      // width, so the extension runs one vector size below the destination.
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}
2720 
2721 //-------------------------------------------------------------------------------------------
2722 
2723 // IndexOf for constant substrings with size >= 8 chars
2724 // which don't need to be loaded through stack.
// Emits the search loop for String.indexOf with a constant substring of at
// least 'stride' elements (8 chars / 16 bytes), so the substring can always
// be (re)loaded directly from memory without going through the stack.
//   str1/cnt1 - string address and length (elements)
//   str2      - substring address; int_cnt2 is its constant length
//   result    - receives the match index, or -1 if not found
//   vec/tmp   - scratch; 'ae' is the StrIntrinsicNode encoding (LL/UU/UL)
// Register assignment is dictated by pcmpestri (see table below).
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  // UL compares Latin-1 substring bytes against UTF-16 string chars, so the
  // substring bytes are zero-extended to shorts on load.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Substring tail fits in an immediate displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2901 
2902 // Small strings are loaded through stack if they cross page boundary.
2903 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2904                                        Register cnt1, Register cnt2,
2905                                        int int_cnt2,  Register result,
2906                                        XMMRegister vec, Register tmp,
2907                                        int ae) {
2908   ShortBranchVerifier sbv(this);
2909   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2910   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2911 
2912   //
2913   // int_cnt2 is length of small (< 8 chars) constant substring
2914   // or (-1) for non constant substring in which case its length
2915   // is in cnt2 register.
2916   //
2917   // Note, inline_string_indexOf() generates checks:
2918   // if (substr.count > string.count) return -1;
2919   // if (substr.count == 0) return 0;
2920   //
2921   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2922   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2923   // This method uses the pcmpestri instruction with bound registers
2924   //   inputs:
2925   //     xmm - substring
2926   //     rax - substring length (elements count)
2927   //     mem - scanned string
2928   //     rdx - string length (elements count)
2929   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2930   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2931   //   outputs:
2932   //     rcx - matched index in string
2933   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2934   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2935   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2936   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2937 
2938   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2939         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2940         FOUND_CANDIDATE;
2941 
2942   { //========================================================
2943     // We don't know where these strings are located
2944     // and we can't read beyond them. Load them through stack.
2945     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2946 
2947     movptr(tmp, rsp); // save old SP
2948 
2949     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2950       if (int_cnt2 == (1>>scale2)) { // One byte
2951         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2952         load_unsigned_byte(result, Address(str2, 0));
2953         movdl(vec, result); // move 32 bits
2954       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2955         // Not enough header space in 32-bit VM: 12+3 = 15.
2956         movl(result, Address(str2, -1));
2957         shrl(result, 8);
2958         movdl(vec, result); // move 32 bits
2959       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2960         load_unsigned_short(result, Address(str2, 0));
2961         movdl(vec, result); // move 32 bits
2962       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2963         movdl(vec, Address(str2, 0)); // move 32 bits
2964       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2965         movq(vec, Address(str2, 0));  // move 64 bits
2966       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2967         // Array header size is 12 bytes in 32-bit VM
2968         // + 6 bytes for 3 chars == 18 bytes,
2969         // enough space to load vec and shift.
2970         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2971         if (ae == StrIntrinsicNode::UL) {
2972           int tail_off = int_cnt2-8;
2973           pmovzxbw(vec, Address(str2, tail_off));
2974           psrldq(vec, -2*tail_off);
2975         }
2976         else {
2977           int tail_off = int_cnt2*(1<<scale2);
2978           movdqu(vec, Address(str2, tail_off-16));
2979           psrldq(vec, 16-tail_off);
2980         }
2981       }
2982     } else { // not constant substring
2983       cmpl(cnt2, stride);
2984       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2985 
2986       // We can read beyond string if srt+16 does not cross page boundary
2987       // since heaps are aligned and mapped by pages.
2988       assert(os::vm_page_size() < (int)G, "default page should be small");
2989       movl(result, str2); // We need only low 32 bits
2990       andl(result, ((int)os::vm_page_size()-1));
2991       cmpl(result, ((int)os::vm_page_size()-16));
2992       jccb(Assembler::belowEqual, CHECK_STR);
2993 
2994       // Move small strings to stack to allow load 16 bytes into vec.
2995       subptr(rsp, 16);
2996       int stk_offset = wordSize-(1<<scale2);
2997       push(cnt2);
2998 
2999       bind(COPY_SUBSTR);
3000       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3001         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3002         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3003       } else if (ae == StrIntrinsicNode::UU) {
3004         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3005         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3006       }
3007       decrement(cnt2);
3008       jccb(Assembler::notZero, COPY_SUBSTR);
3009 
3010       pop(cnt2);
3011       movptr(str2, rsp);  // New substring address
3012     } // non constant
3013 
3014     bind(CHECK_STR);
3015     cmpl(cnt1, stride);
3016     jccb(Assembler::aboveEqual, BIG_STRINGS);
3017 
3018     // Check cross page boundary.
3019     movl(result, str1); // We need only low 32 bits
3020     andl(result, ((int)os::vm_page_size()-1));
3021     cmpl(result, ((int)os::vm_page_size()-16));
3022     jccb(Assembler::belowEqual, BIG_STRINGS);
3023 
3024     subptr(rsp, 16);
3025     int stk_offset = -(1<<scale1);
3026     if (int_cnt2 < 0) { // not constant
3027       push(cnt2);
3028       stk_offset += wordSize;
3029     }
3030     movl(cnt2, cnt1);
3031 
3032     bind(COPY_STR);
3033     if (ae == StrIntrinsicNode::LL) {
3034       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3035       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3036     } else {
3037       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3038       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3039     }
3040     decrement(cnt2);
3041     jccb(Assembler::notZero, COPY_STR);
3042 
3043     if (int_cnt2 < 0) { // not constant
3044       pop(cnt2);
3045     }
3046     movptr(str1, rsp);  // New string address
3047 
3048     bind(BIG_STRINGS);
3049     // Load substring.
3050     if (int_cnt2 < 0) { // -1
3051       if (ae == StrIntrinsicNode::UL) {
3052         pmovzxbw(vec, Address(str2, 0));
3053       } else {
3054         movdqu(vec, Address(str2, 0));
3055       }
3056       push(cnt2);       // substr count
3057       push(str2);       // substr addr
3058       push(str1);       // string addr
3059     } else {
3060       // Small (< 8 chars) constant substrings are loaded already.
3061       movl(cnt2, int_cnt2);
3062     }
3063     push(tmp);  // original SP
3064 
3065   } // Finished loading
3066 
3067   //========================================================
3068   // Start search
3069   //
3070 
3071   movptr(result, str1); // string addr
3072 
3073   if (int_cnt2  < 0) {  // Only for non constant substring
3074     jmpb(SCAN_TO_SUBSTR);
3075 
3076     // SP saved at sp+0
3077     // String saved at sp+1*wordSize
3078     // Substr saved at sp+2*wordSize
3079     // Substr count saved at sp+3*wordSize
3080 
3081     // Reload substr for rescan, this code
3082     // is executed only for large substrings (> 8 chars)
3083     bind(RELOAD_SUBSTR);
3084     movptr(str2, Address(rsp, 2*wordSize));
3085     movl(cnt2, Address(rsp, 3*wordSize));
3086     if (ae == StrIntrinsicNode::UL) {
3087       pmovzxbw(vec, Address(str2, 0));
3088     } else {
3089       movdqu(vec, Address(str2, 0));
3090     }
3091     // We came here after the beginning of the substring was
3092     // matched but the rest of it was not so we need to search
3093     // again. Start from the next element after the previous match.
3094     subptr(str1, result); // Restore counter
3095     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3096       shrl(str1, 1);
3097     }
3098     addl(cnt1, str1);
3099     decrementl(cnt1);   // Shift to next element
3100     cmpl(cnt1, cnt2);
3101     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3102 
3103     addptr(result, (1<<scale1));
3104   } // non constant
3105 
3106   // Scan string for start of substr in 16-byte vectors
3107   bind(SCAN_TO_SUBSTR);
3108   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3109   pcmpestri(vec, Address(result, 0), mode);
3110   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3111   subl(cnt1, stride);
3112   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3113   cmpl(cnt1, cnt2);
3114   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3115   addptr(result, 16);
3116 
3117   bind(ADJUST_STR);
3118   cmpl(cnt1, stride); // Do not read beyond string
3119   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3120   // Back-up string to avoid reading beyond string.
3121   lea(result, Address(result, cnt1, scale1, -16));
3122   movl(cnt1, stride);
3123   jmpb(SCAN_TO_SUBSTR);
3124 
3125   // Found a potential substr
3126   bind(FOUND_CANDIDATE);
3127   // After pcmpestri tmp(rcx) contains matched element index
3128 
3129   // Make sure string is still long enough
3130   subl(cnt1, tmp);
3131   cmpl(cnt1, cnt2);
3132   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3133   // Left less then substring.
3134 
3135   bind(RET_NOT_FOUND);
3136   movl(result, -1);
3137   jmp(CLEANUP);
3138 
3139   bind(FOUND_SUBSTR);
3140   // Compute start addr of substr
3141   lea(result, Address(result, tmp, scale1));
3142   if (int_cnt2 > 0) { // Constant substring
3143     // Repeat search for small substring (< 8 chars)
3144     // from new point without reloading substring.
3145     // Have to check that we don't read beyond string.
3146     cmpl(tmp, stride-int_cnt2);
3147     jccb(Assembler::greater, ADJUST_STR);
3148     // Fall through if matched whole substring.
3149   } else { // non constant
3150     assert(int_cnt2 == -1, "should be != 0");
3151 
3152     addl(tmp, cnt2);
3153     // Found result if we matched whole substring.
3154     cmpl(tmp, stride);
3155     jcc(Assembler::lessEqual, RET_FOUND);
3156 
3157     // Repeat search for small substring (<= 8 chars)
3158     // from new point 'str1' without reloading substring.
3159     cmpl(cnt2, stride);
3160     // Have to check that we don't read beyond string.
3161     jccb(Assembler::lessEqual, ADJUST_STR);
3162 
3163     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3164     // Compare the rest of substring (> 8 chars).
3165     movptr(str1, result);
3166 
3167     cmpl(tmp, cnt2);
3168     // First 8 chars are already matched.
3169     jccb(Assembler::equal, CHECK_NEXT);
3170 
3171     bind(SCAN_SUBSTR);
3172     pcmpestri(vec, Address(str1, 0), mode);
3173     // Need to reload strings pointers if not matched whole vector
3174     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3175 
3176     bind(CHECK_NEXT);
3177     subl(cnt2, stride);
3178     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3179     addptr(str1, 16);
3180     if (ae == StrIntrinsicNode::UL) {
3181       addptr(str2, 8);
3182     } else {
3183       addptr(str2, 16);
3184     }
3185     subl(cnt1, stride);
3186     cmpl(cnt2, stride); // Do not read beyond substring
3187     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3188     // Back-up strings to avoid reading beyond substring.
3189 
3190     if (ae == StrIntrinsicNode::UL) {
3191       lea(str2, Address(str2, cnt2, scale2, -8));
3192       lea(str1, Address(str1, cnt2, scale1, -16));
3193     } else {
3194       lea(str2, Address(str2, cnt2, scale2, -16));
3195       lea(str1, Address(str1, cnt2, scale1, -16));
3196     }
3197     subl(cnt1, cnt2);
3198     movl(cnt2, stride);
3199     addl(cnt1, stride);
3200     bind(CONT_SCAN_SUBSTR);
3201     if (ae == StrIntrinsicNode::UL) {
3202       pmovzxbw(vec, Address(str2, 0));
3203     } else {
3204       movdqu(vec, Address(str2, 0));
3205     }
3206     jmp(SCAN_SUBSTR);
3207 
3208     bind(RET_FOUND_LONG);
3209     movptr(str1, Address(rsp, wordSize));
3210   } // non constant
3211 
3212   bind(RET_FOUND);
3213   // Compute substr offset
3214   subptr(result, str1);
3215   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3216     shrl(result, 1); // index
3217   }
3218   bind(CLEANUP);
3219   pop(rsp); // restore SP
3220 
3221 } // string_indexof
3222 
// Finds the first occurrence of the UTF-16 char 'ch' in the char sequence
// starting at 'str1' with 'cnt1' chars. On exit 'result' holds the char
// index of the first match, or -1 if 'ch' does not occur. Scans 16 chars
// at a time with AVX2 when available, then 8 chars at a time with SSE,
// then one char at a time for the tail.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;  // chars per 16-byte XMM vector

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);  // 'result' doubles as the scan cursor until the end
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);          // < 8 chars: scalar scan only
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);   // < 16 chars: skip the 256-bit loop
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);  // replicate 'ch' into all 16 word lanes
    vpxor(vec2, vec2);                                // vec2 = 0, reference operand for (v)ptest
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);   // lanes equal to 'ch' become all-ones
    vptest(vec2, vec3);              // CF == 1 iff vec3 is all zero (no lane matched)
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // Broadcast 'ch' into all 8 word lanes of vec1 (SSE path).
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // vec1/vec2 were not initialized above; broadcast 'ch' here.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);       // CF == 1 iff no lane matched
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar loop over the remaining (< 8) chars.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract a byte mask of the compare result; the lowest set bit is the
  // byte offset of the first matching lane within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);  // byte offset from start of string
  shrl(result, 1);       // convert byte offset to char index

  bind(DONE_LABEL);
} // string_indexof_char
3315 
// Finds the first occurrence of the byte 'ch' in the Latin-1 sequence
// starting at 'str1' with 'cnt1' bytes. On exit 'result' holds the byte
// index of the first match, or -1 if 'ch' does not occur. Scans 32 bytes
// at a time with AVX2 when available, then 16 bytes at a time with SSE,
// then one byte at a time for the tail.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;  // bytes per 16-byte XMM vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);  // 'result' doubles as the scan cursor until the end
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);     // < 16 bytes: scalar scan only
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);  // < 32 bytes: skip the 256-bit loop
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);  // replicate 'ch' into all 32 byte lanes
    vpxor(vec2, vec2);                                // vec2 = 0, reference operand for (v)ptest
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);  // matching lanes become all-ones
    vptest(vec2, vec3);                                 // CF == 1 iff vec3 is all zero (no match)
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // Broadcast 'ch' into all 16 byte lanes of vec1 (pshufb with a zero mask
    // replicates byte 0 into every lane).
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // vec1/vec2 were not initialized above; broadcast 'ch' here.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);       // CF == 1 iff no lane matched
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar loop over the remaining (< 16) bytes.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract a byte mask of the compare result; the lowest set bit is the
  // byte offset of the first matching lane within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);  // byte index from start of string

  bind(DONE_LABEL);
} // stringL_indexof_char
3408 
3409 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3410   switch (eltype) {
3411   case T_BOOLEAN: return sizeof(jboolean);
3412   case T_BYTE:  return sizeof(jbyte);
3413   case T_SHORT: return sizeof(jshort);
3414   case T_CHAR:  return sizeof(jchar);
3415   case T_INT:   return sizeof(jint);
3416   default:
3417     ShouldNotReachHere();
3418     return -1;
3419   }
3420 }
3421 
3422 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3423   switch (eltype) {
3424   // T_BOOLEAN used as surrogate for unsigned byte
3425   case T_BOOLEAN: movzbl(dst, src);   break;
3426   case T_BYTE:    movsbl(dst, src);   break;
3427   case T_SHORT:   movswl(dst, src);   break;
3428   case T_CHAR:    movzwl(dst, src);   break;
3429   case T_INT:     movl(dst, src);     break;
3430   default:
3431     ShouldNotReachHere();
3432   }
3433 }
3434 
3435 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3436   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3437 }
3438 
3439 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3440   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3441 }
3442 
3443 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3444   const int vlen = Assembler::AVX_256bit;
3445   switch (eltype) {
3446   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3447   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3448   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3449   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3450   case T_INT:
3451     // do nothing
3452     break;
3453   default:
3454     ShouldNotReachHere();
3455   }
3456 }
3457 
// Computes the polynomial hash h = 31*h + a[i] over ary1[0..cnt1-1],
// accumulating into 'result' (the incoming value of 'result' is multiplied
// in, so it acts as the seed). For cnt1 >= 32 a 4-way unrolled AVX2 loop
// processes 32 elements per iteration using the precomputed powers-of-31
// table from StubRoutines; the remainder goes through a scalar loop
// unrolled two elements at a time, plus a final odd element if needed.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  // Note: the UNROLLED_SCALAR_* labels are declared but not bound below.
  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // Aliases so the unrolled loops below can be written with array indexing.
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance ary1 past the vectorized elements; cnt1 becomes the tail count.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  // Horizontal add of each 8-lane vector into the scalar accumulator.
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  // Each iteration folds in two elements:
  //   result = 31*31*result + 31*a[i-1] + a[i]
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);              // 961 == 31*31
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);                // tmp3 = 32*a[i-1]
  subl(tmp3, tmp2);             // tmp3 = 31*a[i-1]
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags are still from cmpl(index, cnt1) above (on both paths here):
  // index > cnt1 means the pair loop already consumed the last element.
  jccb(Assembler::greater, END);
  // Fold in the final odd element: result = 31*result + a[cnt1-1].
  movl(tmp2, result);
  shll(result, 5);              // result = 32*result
  subl(result, tmp2);           // result = 31*result
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3597 
3598 // helper function for string_compare
3599 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3600                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3601                                            Address::ScaleFactor scale2, Register index, int ae) {
3602   if (ae == StrIntrinsicNode::LL) {
3603     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3604     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3605   } else if (ae == StrIntrinsicNode::UU) {
3606     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3607     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3608   } else {
3609     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3610     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3611   }
3612 }
3613 
3614 // Compare strings, used for char[] and byte[].
3615 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3616                                        Register cnt1, Register cnt2, Register result,
3617                                        XMMRegister vec1, int ae, KRegister mask) {
3618   ShortBranchVerifier sbv(this);
3619   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3620   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
3621   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3622   int stride2x2 = 0x40;
3623   Address::ScaleFactor scale = Address::no_scale;
3624   Address::ScaleFactor scale1 = Address::no_scale;
3625   Address::ScaleFactor scale2 = Address::no_scale;
3626 
3627   if (ae != StrIntrinsicNode::LL) {
3628     stride2x2 = 0x20;
3629   }
3630 
3631   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3632     shrl(cnt2, 1);
3633   }
3634   // Compute the minimum of the string lengths and the
3635   // difference of the string lengths (stack).
3636   // Do the conditional move stuff
3637   movl(result, cnt1);
3638   subl(cnt1, cnt2);
3639   push(cnt1);
3640   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3641 
3642   // Is the minimum length zero?
3643   testl(cnt2, cnt2);
3644   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3645   if (ae == StrIntrinsicNode::LL) {
3646     // Load first bytes
3647     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3648     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3649   } else if (ae == StrIntrinsicNode::UU) {
3650     // Load first characters
3651     load_unsigned_short(result, Address(str1, 0));
3652     load_unsigned_short(cnt1, Address(str2, 0));
3653   } else {
3654     load_unsigned_byte(result, Address(str1, 0));
3655     load_unsigned_short(cnt1, Address(str2, 0));
3656   }
3657   subl(result, cnt1);
3658   jcc(Assembler::notZero,  POP_LABEL);
3659 
3660   if (ae == StrIntrinsicNode::UU) {
3661     // Divide length by 2 to get number of chars
3662     shrl(cnt2, 1);
3663   }
3664   cmpl(cnt2, 1);
3665   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3666 
3667   // Check if the strings start at the same location and setup scale and stride
3668   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3669     cmpptr(str1, str2);
3670     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3671     if (ae == StrIntrinsicNode::LL) {
3672       scale = Address::times_1;
3673       stride = 16;
3674     } else {
3675       scale = Address::times_2;
3676       stride = 8;
3677     }
3678   } else {
3679     scale1 = Address::times_1;
3680     scale2 = Address::times_2;
3681     // scale not used
3682     stride = 8;
3683   }
3684 
3685   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3686     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3687     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3688     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3689     Label COMPARE_TAIL_LONG;
3690     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3
3691 
3692     int pcmpmask = 0x19;
3693     if (ae == StrIntrinsicNode::LL) {
3694       pcmpmask &= ~0x01;
3695     }
3696 
3697     // Setup to compare 16-chars (32-bytes) vectors,
3698     // start from first character again because it has aligned address.
3699     if (ae == StrIntrinsicNode::LL) {
3700       stride2 = 32;
3701     } else {
3702       stride2 = 16;
3703     }
3704     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3705       adr_stride = stride << scale;
3706     } else {
3707       adr_stride1 = 8;  //stride << scale1;
3708       adr_stride2 = 16; //stride << scale2;
3709     }
3710 
3711     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3712     // rax and rdx are used by pcmpestri as elements counters
3713     movl(result, cnt2);
3714     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3715     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3716 
3717     // fast path : compare first 2 8-char vectors.
3718     bind(COMPARE_16_CHARS);
3719     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3720       movdqu(vec1, Address(str1, 0));
3721     } else {
3722       pmovzxbw(vec1, Address(str1, 0));
3723     }
3724     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3725     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3726 
3727     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3728       movdqu(vec1, Address(str1, adr_stride));
3729       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3730     } else {
3731       pmovzxbw(vec1, Address(str1, adr_stride1));
3732       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3733     }
3734     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3735     addl(cnt1, stride);
3736 
3737     // Compare the characters at index in cnt1
3738     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3739     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3740     subl(result, cnt2);
3741     jmp(POP_LABEL);
3742 
3743     // Setup the registers to start vector comparison loop
3744     bind(COMPARE_WIDE_VECTORS);
3745     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3746       lea(str1, Address(str1, result, scale));
3747       lea(str2, Address(str2, result, scale));
3748     } else {
3749       lea(str1, Address(str1, result, scale1));
3750       lea(str2, Address(str2, result, scale2));
3751     }
3752     subl(result, stride2);
3753     subl(cnt2, stride2);
3754     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3755     negptr(result);
3756 
3757     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3758     bind(COMPARE_WIDE_VECTORS_LOOP);
3759 
3760     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3761       cmpl(cnt2, stride2x2);
3762       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3763       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3764       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3765 
3766       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3767       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3768         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3769         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3770       } else {
3771         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3772         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3773       }
3774       kortestql(mask, mask);
3775       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3776       addptr(result, stride2x2);  // update since we already compared at this addr
3777       subl(cnt2, stride2x2);      // and sub the size too
3778       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3779 
3780       vpxor(vec1, vec1);
3781       jmpb(COMPARE_WIDE_TAIL);
3782     }//if (VM_Version::supports_avx512vlbw())
3783 
3784     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3785     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3786       vmovdqu(vec1, Address(str1, result, scale));
3787       vpxor(vec1, Address(str2, result, scale));
3788     } else {
3789       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3790       vpxor(vec1, Address(str2, result, scale2));
3791     }
3792     vptest(vec1, vec1);
3793     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3794     addptr(result, stride2);
3795     subl(cnt2, stride2);
3796     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3797     // clean upper bits of YMM registers
3798     vpxor(vec1, vec1);
3799 
3800     // compare wide vectors tail
3801     bind(COMPARE_WIDE_TAIL);
3802     testptr(result, result);
3803     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3804 
3805     movl(result, stride2);
3806     movl(cnt2, result);
3807     negptr(result);
3808     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3809 
3810     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3811     bind(VECTOR_NOT_EQUAL);
3812     // clean upper bits of YMM registers
3813     vpxor(vec1, vec1);
3814     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3815       lea(str1, Address(str1, result, scale));
3816       lea(str2, Address(str2, result, scale));
3817     } else {
3818       lea(str1, Address(str1, result, scale1));
3819       lea(str2, Address(str2, result, scale2));
3820     }
3821     jmp(COMPARE_16_CHARS);
3822 
3823     // Compare tail chars, length between 1 to 15 chars
3824     bind(COMPARE_TAIL_LONG);
3825     movl(cnt2, result);
3826     cmpl(cnt2, stride);
3827     jcc(Assembler::less, COMPARE_SMALL_STR);
3828 
3829     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3830       movdqu(vec1, Address(str1, 0));
3831     } else {
3832       pmovzxbw(vec1, Address(str1, 0));
3833     }
3834     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3835     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3836     subptr(cnt2, stride);
3837     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3838     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3839       lea(str1, Address(str1, result, scale));
3840       lea(str2, Address(str2, result, scale));
3841     } else {
3842       lea(str1, Address(str1, result, scale1));
3843       lea(str2, Address(str2, result, scale2));
3844     }
3845     negptr(cnt2);
3846     jmpb(WHILE_HEAD_LABEL);
3847 
3848     bind(COMPARE_SMALL_STR);
3849   } else if (UseSSE42Intrinsics) {
3850     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3851     int pcmpmask = 0x19;
3852     // Setup to compare 8-char (16-byte) vectors,
3853     // start from first character again because it has aligned address.
3854     movl(result, cnt2);
3855     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3856     if (ae == StrIntrinsicNode::LL) {
3857       pcmpmask &= ~0x01;
3858     }
3859     jcc(Assembler::zero, COMPARE_TAIL);
3860     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3861       lea(str1, Address(str1, result, scale));
3862       lea(str2, Address(str2, result, scale));
3863     } else {
3864       lea(str1, Address(str1, result, scale1));
3865       lea(str2, Address(str2, result, scale2));
3866     }
3867     negptr(result);
3868 
3869     // pcmpestri
3870     //   inputs:
3871     //     vec1- substring
3872     //     rax - negative string length (elements count)
3873     //     mem - scanned string
3874     //     rdx - string length (elements count)
3875     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3876     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3877     //   outputs:
3878     //     rcx - first mismatched element index
3879     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3880 
3881     bind(COMPARE_WIDE_VECTORS);
3882     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3883       movdqu(vec1, Address(str1, result, scale));
3884       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3885     } else {
3886       pmovzxbw(vec1, Address(str1, result, scale1));
3887       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3888     }
3889     // After pcmpestri cnt1(rcx) contains mismatched element index
3890 
3891     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3892     addptr(result, stride);
3893     subptr(cnt2, stride);
3894     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3895 
3896     // compare wide vectors tail
3897     testptr(result, result);
3898     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3899 
3900     movl(cnt2, stride);
3901     movl(result, stride);
3902     negptr(result);
3903     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3904       movdqu(vec1, Address(str1, result, scale));
3905       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3906     } else {
3907       pmovzxbw(vec1, Address(str1, result, scale1));
3908       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3909     }
3910     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3911 
3912     // Mismatched characters in the vectors
3913     bind(VECTOR_NOT_EQUAL);
3914     addptr(cnt1, result);
3915     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3916     subl(result, cnt2);
3917     jmpb(POP_LABEL);
3918 
3919     bind(COMPARE_TAIL); // limit is zero
3920     movl(cnt2, result);
3921     // Fallthru to tail compare
3922   }
3923   // Shift str2 and str1 to the end of the arrays, negate min
3924   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3925     lea(str1, Address(str1, cnt2, scale));
3926     lea(str2, Address(str2, cnt2, scale));
3927   } else {
3928     lea(str1, Address(str1, cnt2, scale1));
3929     lea(str2, Address(str2, cnt2, scale2));
3930   }
3931   decrementl(cnt2);  // first character was compared already
3932   negptr(cnt2);
3933 
3934   // Compare the rest of the elements
3935   bind(WHILE_HEAD_LABEL);
3936   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3937   subl(result, cnt1);
3938   jccb(Assembler::notZero, POP_LABEL);
3939   increment(cnt2);
3940   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3941 
3942   // Strings are equal up to min length.  Return the length difference.
3943   bind(LENGTH_DIFF_LABEL);
3944   pop(result);
3945   if (ae == StrIntrinsicNode::UU) {
3946     // Divide diff by 2 to get number of chars
3947     sarl(result, 1);
3948   }
3949   jmpb(DONE_LABEL);
3950 
3951   if (VM_Version::supports_avx512vlbw()) {
3952 
3953     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3954 
3955     kmovql(cnt1, mask);
3956     notq(cnt1);
3957     bsfq(cnt2, cnt1);
3958     if (ae != StrIntrinsicNode::LL) {
3959       // Divide diff by 2 to get number of chars
3960       sarl(cnt2, 1);
3961     }
3962     addq(result, cnt2);
3963     if (ae == StrIntrinsicNode::LL) {
3964       load_unsigned_byte(cnt1, Address(str2, result));
3965       load_unsigned_byte(result, Address(str1, result));
3966     } else if (ae == StrIntrinsicNode::UU) {
3967       load_unsigned_short(cnt1, Address(str2, result, scale));
3968       load_unsigned_short(result, Address(str1, result, scale));
3969     } else {
3970       load_unsigned_short(cnt1, Address(str2, result, scale2));
3971       load_unsigned_byte(result, Address(str1, result, scale1));
3972     }
3973     subl(result, cnt1);
3974     jmpb(POP_LABEL);
3975   }//if (VM_Version::supports_avx512vlbw())
3976 
3977   // Discard the stored length difference
3978   bind(POP_LABEL);
3979   pop(cnt1);
3980 
3981   // That's it
3982   bind(DONE_LABEL);
3983   if(ae == StrIntrinsicNode::UL) {
3984     negl(result);
3985   }
3986 
3987 }
3988 
3989 // Search for Non-ASCII character (Negative byte value) in a byte array,
3990 // return the index of the first such character, otherwise the length
3991 // of the array segment searched.
3992 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3993 //   @IntrinsicCandidate
3994 //   public static int countPositives(byte[] ba, int off, int len) {
3995 //     for (int i = off; i < off + len; i++) {
3996 //       if (ba[i] < 0) {
3997 //         return i - off;
3998 //       }
3999 //     }
4000 //     return len;
4001 //   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // Leaves in 'result' the count of leading non-negative bytes of ary1[0..len):
  // the index of the first negative byte, or 'len' when none exists.
  // 'result' starts as a copy of 'len' and is only adjusted down when a
  // negative byte is found. 'ary1' and 'len' are clobbered.
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy; optimistic answer — assume all bytes are positive
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); // vec2 = all-zero comparand

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars), multiple of 64
    jccb(Assembler::zero, test_tail);

    // Point ary1 past the vectorized region and run 'len' from -count up to 0.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    // (0 > byte, signed, is true exactly for negative bytes).
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    // (testl with -1 sets ZF iff tmp1 == 0)
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      // 'len' can be reused here: its vector count is consumed; build the
      // k-register tail mask with the low 'tmp1' bits set.
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    // Masked compare of the tail; only lanes selected by mask2 participate.
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE); // no negatives in the tail — result already == len

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask of the sign bit of every byte
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);       // ZF clear iff some byte has its sign bit set
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      // (reads the last 32 bytes, overlapping the already-verified region).
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are negative bytes, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // mask of the sign bit of every byte
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);    // broadcast to all four dwords

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  // Scalar tail: at this point 'ary1' points at the (at most 63-byte) tail
  // and 'len' holds its length; 'result' has already been positioned so that
  // subtracting the remaining distance yields the final answer.
  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);   // any sign bit set in this dword?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);   // sign bits of the two tail bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);    // last byte is negative — exclude it from the count
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);       // first of the two bytes was positive

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4233 
4234 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // Emits an equality check of two arrays (is_array_equ == true: 'ary1'/'ary2'
  // are array oops, lengths are loaded and compared here) or of two raw memory
  // regions of 'limit' elements (substring compare). Sets 'result' to 1 when
  // equal, 0 otherwise. With expand_ary2, ary1 holds shorts and ary2 holds
  // bytes that are zero-extended to shorts on the fly (see vpmovzxbw/cmpw).
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // With expansion, ary1 is indexed in 2-byte units and the loop step halves.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);            // same oop (or both null) => trivially equal
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[]: convert element count to bytes.
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    // (16 bytes of ary2 per iteration when it is being expanded to shorts)
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point both arrays past the vector region; run 'limit' from -count to 0.
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      // kortest sets CF iff all 64 mask bits are 1, i.e. every byte matched;
      // aboveEqual (CF == 0) therefore means some byte differed.
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      // Widen 16 bytes of ary2 to 16 shorts so they compare against ary1.
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);       // equal chunks xor to all-zero

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Tail: re-compare the last full vector ending at the array ends
    // (overlaps the already-verified region, which is harmless).
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Overlapping compare of the final 16 bytes.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    // Expanded tail is compared element-by-element below; nothing left means equal.
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char arrays have an even byte count, so there is never a byte tail.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4506 
4507 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4508 #define __ masm.
4509   Register dst = stub.data<0>();
4510   XMMRegister src = stub.data<1>();
4511   address target = stub.data<2>();
4512   __ bind(stub.entry());
4513   __ subptr(rsp, 8);
4514   __ movdbl(Address(rsp), src);
4515   __ call(RuntimeAddress(target));
4516   // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4517   __ pop(dst);
4518   __ jmp(stub.continuation());
4519 #undef __
4520 }
4521 
4522 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4523   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4524   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4525 
4526   address slowpath_target;
4527   if (dst_bt == T_INT) {
4528     if (src_bt == T_FLOAT) {
4529       cvttss2sil(dst, src);
4530       cmpl(dst, 0x80000000);
4531       slowpath_target = StubRoutines::x86::f2i_fixup();
4532     } else {
4533       cvttsd2sil(dst, src);
4534       cmpl(dst, 0x80000000);
4535       slowpath_target = StubRoutines::x86::d2i_fixup();
4536     }
4537   } else {
4538     if (src_bt == T_FLOAT) {
4539       cvttss2siq(dst, src);
4540       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4541       slowpath_target = StubRoutines::x86::f2l_fixup();
4542     } else {
4543       cvttsd2siq(dst, src);
4544       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4545       slowpath_target = StubRoutines::x86::d2l_fixup();
4546     }
4547   }
4548 
4549   // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4550   int max_size = 23 + (UseAPX ? 1 : 0);
4551   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4552   jcc(Assembler::equal, stub->entry());
4553   bind(stub->continuation());
4554 }
4555 
4556 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4557                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4558   switch(ideal_opc) {
4559     case Op_LShiftVS:
4560       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4561     case Op_LShiftVI:
4562       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4563     case Op_LShiftVL:
4564       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4565     case Op_RShiftVS:
4566       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4567     case Op_RShiftVI:
4568       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4569     case Op_RShiftVL:
4570       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4571     case Op_URShiftVS:
4572       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4573     case Op_URShiftVI:
4574       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4575     case Op_URShiftVL:
4576       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4577     case Op_RotateRightV:
4578       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4579     case Op_RotateLeftV:
4580       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4581     default:
4582       fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
4583       break;
4584   }
4585 }
4586 
4587 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4588                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4589   if (is_unsigned) {
4590     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4591   } else {
4592     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4593   }
4594 }
4595 
4596 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4597                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4598   switch (elem_bt) {
4599     case T_BYTE:
4600       if (ideal_opc == Op_SaturatingAddV) {
4601         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4602       } else {
4603         assert(ideal_opc == Op_SaturatingSubV, "");
4604         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4605       }
4606       break;
4607     case T_SHORT:
4608       if (ideal_opc == Op_SaturatingAddV) {
4609         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4610       } else {
4611         assert(ideal_opc == Op_SaturatingSubV, "");
4612         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4613       }
4614       break;
4615     default:
4616       fatal("Unsupported type %s", type2name(elem_bt));
4617       break;
4618   }
4619 }
4620 
4621 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4622                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4623   switch (elem_bt) {
4624     case T_BYTE:
4625       if (ideal_opc == Op_SaturatingAddV) {
4626         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4627       } else {
4628         assert(ideal_opc == Op_SaturatingSubV, "");
4629         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4630       }
4631       break;
4632     case T_SHORT:
4633       if (ideal_opc == Op_SaturatingAddV) {
4634         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4635       } else {
4636         assert(ideal_opc == Op_SaturatingSubV, "");
4637         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4638       }
4639       break;
4640     default:
4641       fatal("Unsupported type %s", type2name(elem_bt));
4642       break;
4643   }
4644 }
4645 
4646 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4647                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4648   if (is_unsigned) {
4649     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4650   } else {
4651     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4652   }
4653 }
4654 
4655 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4656                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4657   switch (elem_bt) {
4658     case T_BYTE:
4659       if (ideal_opc == Op_SaturatingAddV) {
4660         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4661       } else {
4662         assert(ideal_opc == Op_SaturatingSubV, "");
4663         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4664       }
4665       break;
4666     case T_SHORT:
4667       if (ideal_opc == Op_SaturatingAddV) {
4668         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4669       } else {
4670         assert(ideal_opc == Op_SaturatingSubV, "");
4671         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4672       }
4673       break;
4674     default:
4675       fatal("Unsupported type %s", type2name(elem_bt));
4676       break;
4677   }
4678 }
4679 
4680 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4681                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4682   switch (elem_bt) {
4683     case T_BYTE:
4684       if (ideal_opc == Op_SaturatingAddV) {
4685         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4686       } else {
4687         assert(ideal_opc == Op_SaturatingSubV, "");
4688         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4689       }
4690       break;
4691     case T_SHORT:
4692       if (ideal_opc == Op_SaturatingAddV) {
4693         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4694       } else {
4695         assert(ideal_opc == Op_SaturatingSubV, "");
4696         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4697       }
4698       break;
4699     default:
4700       fatal("Unsupported type %s", type2name(elem_bt));
4701       break;
4702   }
4703 }
4704 
// Emit the EVEX masked (predicated) vector instruction corresponding to the
// C2 ideal opcode 'ideal_opc', with both inputs in vector registers.
// Lanes selected by 'mask' receive the result; unselected lanes are merged
// from dst when 'merge' is true, zeroed otherwise. 'eType' selects the lane
// type for the type-parameterized helpers; 'is_varshift' selects the per-lane
// variable-shift forms of the shift instructions.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary abs takes a single vector input (src2).
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note the swapped operand order (src2, src1) relative to the other cases.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4806 
// Memory-operand variant of evmasked_op: the second input comes from memory
// (src2). Supports the subset of masked ops that have a reg/mem encoding; the
// shift, rotate, rearrange and abs ops require the register variant above.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4871 
4872 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4873                                   KRegister src1, KRegister src2) {
4874   BasicType etype = T_ILLEGAL;
4875   switch(mask_len) {
4876     case 2:
4877     case 4:
4878     case 8:  etype = T_BYTE; break;
4879     case 16: etype = T_SHORT; break;
4880     case 32: etype = T_INT; break;
4881     case 64: etype = T_LONG; break;
4882     default: fatal("Unsupported type"); break;
4883   }
4884   assert(etype != T_ILLEGAL, "");
4885   switch(ideal_opc) {
4886     case Op_AndVMask:
4887       kand(etype, dst, src1, src2); break;
4888     case Op_OrVMask:
4889       kor(etype, dst, src1, src2); break;
4890     case Op_XorVMask:
4891       kxor(etype, dst, src1, src2); break;
4892     default:
4893       fatal("Unsupported masked operation"); break;
4894   }
4895 }
4896 
4897 /*
4898  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4899  * If src is NaN, the result is 0.
4900  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4901  * the result is equal to the value of Integer.MIN_VALUE.
4902  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4903  * the result is equal to the value of Integer.MAX_VALUE.
4904  */
// AVX variant: fix up lanes of the int result in dst (produced by vcvttps2dq
// from float vector src) that hold the integer indefinite value 0x80000000,
// according to the special-value rules documented above.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Compare the destination lanes with the float_sign_flip value (0x80000000)
  // to get a mask of all special-value lanes; fast path out if there are none.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // Flip xtmp1 lanes from 0x80000000 to 0x7fffffff (Integer.MAX_VALUE).
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero the destination lanes whose source lanes are NaN (unordered compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Replace those +ve special lanes with Integer.MAX_VALUE.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4934 
// EVEX variant of the float->int special-value fixup; uses opmask registers
// instead of vector blends.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Build an opmask of destination lanes equal to 0x80000000 (special values);
  // fast path out if the mask is empty.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source lanes are NaN (unordered compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop the NaN lanes from the special mask, then select the remaining
  // special lanes whose source is not-less-than zero (+ve special values).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // vpternlogd 0x11 computes ~xtmp1, i.e. 0x7fffffff (Integer.MAX_VALUE) lanes.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4956 
// EVEX float->long special-value fixup: dst holds the long result of
// evcvttps2qq from float vector src; lanes equal to 0x8000000000000000
// (double_sign_flip) came from special source values and are patched here.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Build an opmask of special-value destination lanes; fast path out if none.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (float) source lanes are NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get Long.MAX_VALUE
  // (vpternlogq 0x11 computes ~xtmp1 == 0x7fffffffffffffff).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4979 
// EVEX double->int special-value fixup: dst holds the int result of
// vcvttpd2dq from double vector src (the int result occupies half the
// source width); 0x80000000 lanes are patched per the rules above.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Build an opmask of destination dword lanes equal to 0x80000000.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (double) source lanes are NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get Integer.MAX_VALUE
  // (vpternlogq 0x11 computes ~xtmp1, giving 0x7fffffff per dword).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5001 
5002 /*
5003  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5004  * If src is NaN, the result is 0.
5005  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5006  * the result is equal to the value of Long.MIN_VALUE.
5007  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5008  * the result is equal to the value of Long.MAX_VALUE.
5009  */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Build an opmask of destination qword lanes equal to 0x8000000000000000;
  // fast path out if there are no special values.
  // NOTE(review): the dword-form load (evmovdqul) is equivalent to a qword
  // load here since it is a full, unmasked register load — confirm intent.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source lanes are NaN (unordered compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get Long.MAX_VALUE
  // (vpternlogq 0x11 computes ~xtmp1 == 0x7fffffffffffffff).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5032 
5033 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5034                                                              XMMRegister xtmp, int index, int vec_enc) {
5035    assert(vec_enc < Assembler::AVX_512bit, "");
5036    if (vec_enc == Assembler::AVX_256bit) {
5037      vextractf128_high(xtmp, src);
5038      vshufps(dst, src, xtmp, index, vec_enc);
5039    } else {
5040      vshufps(dst, src, zero, index, vec_enc);
5041    }
5042 }
5043 
// AVX variant of the double->int special-value fixup. The int result in dst
// is 128-bit wide while the double source uses src_vec_enc; the compare masks
// are packed down from quadword to doubleword lanes before blending.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower double words from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower double words from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value (0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5083 
5084 
// Narrow packed ints in dst to short or byte lanes. 'zero' must hold an
// all-zero vector (pack operand); 'xtmp' is a temporary for the 256-bit
// cross-lane fixup.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Keep only the low 16 bits of each int lane, then pack dwords to words.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        // vpackusdw operates within 128-bit lanes; fix the cross-lane order.
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case  T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Keep only the low byte of each int lane, then pack dwords to words
      // and words to bytes.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5108 
5109 /*
5110  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5111  * a) Perform vector D2L/F2I cast.
5112  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5113  *    It signifies that source value could be any of the special floating point
5114  *    values(NaN,-Inf,Inf,Max,-Min).
5115  * c) Set destination to zero if source is NaN value.
5116  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5117  */
5118 
// AVX float -> int/short/byte cast with special-value fixup (see the
// algorithm comment above).
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  // Truncating conversion; special inputs produce the 0x80000000 indefinite
  // value, which the fixup routine below rewrites.
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // Narrow to sub-word lanes; xtmp4 supplies the zero vector for packing.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5131 
// EVEX float -> int/short/byte cast: convert, fix up special values, then
// truncate-narrow to the target lane size with evpmov.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      // Already the target size.
      break;
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5151 
// EVEX float -> long cast: truncating convert followed by special-value fixup.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5158 
// Handling for downcasting from double to integer or sub-word types on AVX2.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  // Truncating double -> int conversion (result is 128-bit wide), then fix up
  // special-value lanes.
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5173 
// EVEX double -> long/int/short/byte cast. With AVX512DQ the conversion goes
// through a direct double->long cast; otherwise through double->int with the
// int-sized special-value fixup. The narrowing evpmov steps then truncate to
// the requested lane size.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    // No AVX512DQ: only int-or-narrower targets are supported on this path.
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5214 
5215 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5216   switch(to_elem_bt) {
5217     case T_LONG:
5218       evcvttps2qqs(dst, src, vec_enc);
5219       break;
5220     case T_INT:
5221       evcvttps2dqs(dst, src, vec_enc);
5222       break;
5223     case T_SHORT:
5224       evcvttps2dqs(dst, src, vec_enc);
5225       evpmovdw(dst, dst, vec_enc);
5226       break;
5227     case T_BYTE:
5228       evcvttps2dqs(dst, src, vec_enc);
5229       evpmovdb(dst, dst, vec_enc);
5230       break;
5231     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5232   }
5233 }
5234 
// Memory-operand variant of the AVX10.2 float -> integral cast; the
// saturating convert instructions make a special-value fixup unnecessary.
void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttps2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttps2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      // Convert to int, then truncate-narrow to shorts.
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      // Convert to int, then truncate-narrow to bytes.
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
  }
}
5254 
// AVX10.2 double -> integral cast (register source); saturating converts,
// no special-value fixup required.
void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttpd2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttpd2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      // Convert to int, then truncate-narrow to shorts.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      // Convert to int, then truncate-narrow to bytes.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
  }
}
5274 
// Memory-operand variant of the AVX10.2 double -> integral cast.
void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttpd2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttpd2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      // Convert to int, then truncate-narrow to shorts.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      // Convert to int, then truncate-narrow to bytes.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
  }
}
5294 
5295 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5296                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5297                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5298   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5299   // and re-instantiate original MXCSR.RC mode after that.
5300   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5301 
5302   mov64(tmp, julong_cast(0.5L));
5303   evpbroadcastq(xtmp1, tmp, vec_enc);
5304   vaddpd(xtmp1, src , xtmp1, vec_enc);
5305   evcvtpd2qq(dst, xtmp1, vec_enc);
5306   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5307                                                 double_sign_flip, vec_enc);;
5308 
5309   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5310 }
5311 
// Vector Math.round for floats (EVEX): floor(val + 0.5) per lane under a
// temporary round-towards-negative-infinity MXCSR mode, with special-value
// fixup; restores the standard MXCSR state before returning.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f and add it to every source lane, then convert to int with
  // the rounding mode installed above.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up NaN/Inf/out-of-range lanes.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5329 
// Vector Math.round for floats (AVX): same algorithm as the EVEX variant but
// using vector-register blends for the special-value fixup.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f and add it to every source lane, then convert to int with
  // the rounding mode installed above.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up NaN/Inf/out-of-range lanes.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5346 
5347 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5348                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5349   switch (from_elem_bt) {
5350     case T_BYTE:
5351       switch (to_elem_bt) {
5352         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5353         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5354         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5355         default: ShouldNotReachHere();
5356       }
5357       break;
5358     case T_SHORT:
5359       switch (to_elem_bt) {
5360         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5361         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5362         default: ShouldNotReachHere();
5363       }
5364       break;
5365     case T_INT:
5366       assert(to_elem_bt == T_LONG, "");
5367       vpmovzxdq(dst, src, vlen_enc);
5368       break;
5369     default:
5370       ShouldNotReachHere();
5371   }
5372 }
5373 
5374 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5375                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5376   switch (from_elem_bt) {
5377     case T_BYTE:
5378       switch (to_elem_bt) {
5379         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5380         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5381         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5382         default: ShouldNotReachHere();
5383       }
5384       break;
5385     case T_SHORT:
5386       switch (to_elem_bt) {
5387         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5388         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5389         default: ShouldNotReachHere();
5390       }
5391       break;
5392     case T_INT:
5393       assert(to_elem_bt == T_LONG, "");
5394       vpmovsxdq(dst, src, vlen_enc);
5395       break;
5396     default:
5397       ShouldNotReachHere();
5398   }
5399 }
5400 
// Convert a vector mask held in a vector register between lane sizes.
// Widening uses sign extension; narrowing uses signed saturating packs,
// which preserve the all-ones/all-zero lane pattern of a mask.
// Not used for 512-bit vectors (those use opmask registers).
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Size the operation by the larger of the two lane widths.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend by the size ratio (-1 lanes stay -1).
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: saturating packs operate within 128-bit lanes, so the
    // 256-bit forms need a vpermq to pull the packed halves together.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5455 
5456 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5457                                    bool merge, BasicType bt, int vlen_enc) {
5458   if (bt == T_INT) {
5459     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5460   } else {
5461     assert(bt == T_LONG, "");
5462     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5463   }
5464 }
5465 
5466 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5467                                    bool merge, BasicType bt, int vlen_enc) {
5468   if (bt == T_INT) {
5469     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5470   } else {
5471     assert(bt == T_LONG, "");
5472     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5473   }
5474 }
5475 
// Expand the low mask_len bits of the GPR src into a byte vector in dst,
// one byte per mask bit, 8 bits per iteration. PDEP with the repeated
// 0x01 byte pattern deposits each mask bit into the least significant
// bit of a separate byte. rtmp1, rtmp2 and xtmp are clobbered.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  // Spread the low 8 mask bits of src, one bit per byte, into rtmp1.
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a working copy of src for the remaining 8-bit chunks and
    // zero the staging vector register.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 16-byte staging chunk.
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    // Consume the next 8 mask bits.
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5514 
// Reduce the scalar mask value in tmp (one bit per vector lane) to the
// requested result in dst. masklen is the number of valid mask bits.
// tmp is clobbered.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      // Number of set lanes == population count of the mask bits.
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // Index of highest set bit is 63 - lzcnt; lzcnt yields 64 for a
        // zero mask, giving the expected -1 "not found" result.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // BSR leaves the destination unchanged on zero input, so
        // precharge dst with -1 and conditionally move the index in.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit just past the valid mask bits so tzcnt
          // returns masklen when no mask bit is set.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcnt returns 32 on zero input, which already equals masklen.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // BSF leaves the destination unchanged on zero input, so
          // precharge dst with masklen and conditionally move the index in.
          movl(dst, masklen);
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The mask value is already in scalar form; nothing to emit.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5564 
// Compute a scalar mask operation (opc) over an AVX512 opmask register:
// move the mask into the GPR tmp, clip stale upper bits where needed,
// then reduce with vector_mask_operation_helper. tmp is clobbered.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt: its helper ORs a sentinel bit in at masklen, so
  // stale bits above the valid range cannot affect the result.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5584 
// Compute a scalar mask operation (opc) for AVX/AVX2 targets where the
// mask lives in a vector register as 0/-1 lanes (0/1 for T_BOOLEAN).
// The lane MSBs are extracted into the GPR tmp with a movmsk variant,
// clipped to the valid bits where needed, then reduced with
// vector_mask_operation_helper. xtmp and tmp are clobbered.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - x) so the byte MSBs are set before extracting them.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow word lanes to bytes, then extract the byte MSBs.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        // vpacksswb packs within 128-bit lanes; gather the valid quadwords.
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5634 
// Compress the low mask_len bits of src towards the least significant
// end: dst receives popcount(src[0..mask_len)) contiguous set bits.
// Implemented with PEXT: extracting from an all-ones value through the
// mask bits gathers one set bit per set mask bit at the low end.
// rtmp1 and rtmp2 are clobbered.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Discard bits at and above mask_len.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5643 
// AVX2 fallback for vector compress/expand on 4/8-byte element types.
// The mask lane sign bits form an index into a stub-resident permute
// table; the selected 32-byte row rearranges the source lanes and then
// doubles as a blend mask to zero the unfilled lanes.
// rtmp, rscratch, permv and xtmp are clobbered.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5677 
5678 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5679                                                bool merge, BasicType bt, int vec_enc) {
5680   if (opcode == Op_CompressV) {
5681     switch(bt) {
5682     case T_BYTE:
5683       evpcompressb(dst, mask, src, merge, vec_enc);
5684       break;
5685     case T_CHAR:
5686     case T_SHORT:
5687       evpcompressw(dst, mask, src, merge, vec_enc);
5688       break;
5689     case T_INT:
5690       evpcompressd(dst, mask, src, merge, vec_enc);
5691       break;
5692     case T_FLOAT:
5693       evcompressps(dst, mask, src, merge, vec_enc);
5694       break;
5695     case T_LONG:
5696       evpcompressq(dst, mask, src, merge, vec_enc);
5697       break;
5698     case T_DOUBLE:
5699       evcompresspd(dst, mask, src, merge, vec_enc);
5700       break;
5701     default:
5702       fatal("Unsupported type %s", type2name(bt));
5703       break;
5704     }
5705   } else {
5706     assert(opcode == Op_ExpandV, "");
5707     switch(bt) {
5708     case T_BYTE:
5709       evpexpandb(dst, mask, src, merge, vec_enc);
5710       break;
5711     case T_CHAR:
5712     case T_SHORT:
5713       evpexpandw(dst, mask, src, merge, vec_enc);
5714       break;
5715     case T_INT:
5716       evpexpandd(dst, mask, src, merge, vec_enc);
5717       break;
5718     case T_FLOAT:
5719       evexpandps(dst, mask, src, merge, vec_enc);
5720       break;
5721     case T_LONG:
5722       evpexpandq(dst, mask, src, merge, vec_enc);
5723       break;
5724     case T_DOUBLE:
5725       evexpandpd(dst, mask, src, merge, vec_enc);
5726       break;
5727     default:
5728       fatal("Unsupported type %s", type2name(bt));
5729       break;
5730     }
5731   }
5732 }
5733 
// Vector Math.signum for doubles (Op_SignumVD) or floats (Op_SignumVF)
// on AVX512: each dst lane becomes -1.0 for a negative input, +1.0 for
// a positive input, and the input itself for NaN, -0.0 and 0.0.
// zero/one hold broadcast 0.0/1.0 constants; ktmp1 is clobbered.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);  // dst = 0.0 - 1.0 = -1.0 in every lane
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);  // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5755 
// Vector Math.signum for doubles/floats on AVX (no opmask registers):
// same semantics as vector_signum_evex, using variable blends keyed on
// the sign bit of src and on an unordered-equal compare against zero.
// xtmp1 is clobbered.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);  // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);  // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5775 
// Load an opmask register with the low mask_len bits taken from the
// scalar in src. NOTE(review): the right-shift clipping pattern below
// only yields a "mask_len bits set/clear" result when src is 0 or -1 --
// presumably the callers replicate a boolean this way; confirm at the
// call sites.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the surplus high bits so exactly mask_len bits remain.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW only 16-bit opmask operations are available.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5794 
5795 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5796   int lane_size = type2aelembytes(bt);
5797   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5798       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5799     movptr(rtmp, imm32);
5800     switch(lane_size) {
5801       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5802       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5803       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5804       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5805       fatal("Unsupported lane size %d", lane_size);
5806       break;
5807     }
5808   } else {
5809     movptr(rtmp, imm32);
5810     movq(dst, rtmp);
5811     switch(lane_size) {
5812       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5813       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5814       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5815       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5816       fatal("Unsupported lane size %d", lane_size);
5817       break;
5818     }
5819   }
5820 }
5821 
5822 //
5823 // Following is lookup table based popcount computation algorithm:-
5824 //       Index   Bit set count
5825 //     [ 0000 ->   0,
5826 //       0001 ->   1,
5827 //       0010 ->   1,
5828 //       0011 ->   2,
5829 //       0100 ->   1,
5830 //       0101 ->   2,
5831 //       0110 ->   2,
5832 //       0111 ->   3,
5833 //       1000 ->   1,
5834 //       1001 ->   2,
5835 //       1010 ->   3,
5836 //       1011 ->   3,
5837 //       1100 ->   2,
5838 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5840 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5841 //     shuffle indices for lookup table access.
5842 //  b. Right shift each byte of vector lane by 4 positions.
5843 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5844 //     shuffle indices for lookup table access.
5845 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5846 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5847 //     count of all the bytes of a quadword.
5848 //  f. Perform step e. for upper 128bit vector lane.
5849 //  g. Pack the bitset count of quadwords back to double word.
5850 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5851 
// Steps a-d of the lookup-table popcount algorithm above: per-byte bit
// count obtained from two 4-bit table lookups (vpshufb against the stub
// LUT) and a byte-wise add. xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);   // nibble mask
  vpsrlw(dst, src, 4, vec_enc);                          // step b: bring upper nibbles down
  vpand(dst, dst, xtmp1, vec_enc);                       // isolate upper-nibble indices
  vpand(xtmp1, src, xtmp1, vec_enc);                     // isolate lower-nibble indices
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);                 // step a: LUT lookup, lower nibbles
  vpshufb(dst, xtmp2, dst, vec_enc);                     // step c: LUT lookup, upper nibbles
  vpaddb(dst, dst, xtmp1, vec_enc);                      // step d: sum the two nibble counts
}
5864 
// Per-dword popcount: start from the per-byte counts, then sum the bytes
// of each dword (steps e-g above) via vpsadbw against zero on the
// unpacked halves and repack to dwords.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);    // zero-extend upper dwords to qwords
  vpsadbw(dst, dst, xtmp2, vec_enc);         // byte-sum each qword
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);  // zero-extend lower dwords to qwords
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);       // pack qword sums back into dwords
}
5876 
// Per-word popcount: per-byte counts, then each word's count is the sum
// of its high byte's count (shifted down) and its low byte's count
// (masked in place).
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5886 
// Per-qword popcount: per-byte counts summed within each quadword by
// vpsadbw against zero (step h: no unpack/pack needed for 64-bit lanes).
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5893 
5894 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5895                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5896   switch(bt) {
5897     case T_LONG:
5898       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5899       break;
5900     case T_INT:
5901       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5902       break;
5903     case T_CHAR:
5904     case T_SHORT:
5905       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5906       break;
5907     case T_BYTE:
5908     case T_BOOLEAN:
5909       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5910       break;
5911     default:
5912       fatal("Unsupported type %s", type2name(bt));
5913       break;
5914   }
5915 }
5916 
5917 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5918                                                       KRegister mask, bool merge, int vec_enc) {
5919   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5920   switch(bt) {
5921     case T_LONG:
5922       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5923       evpopcntq(dst, mask, src, merge, vec_enc);
5924       break;
5925     case T_INT:
5926       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5927       evpopcntd(dst, mask, src, merge, vec_enc);
5928       break;
5929     case T_CHAR:
5930     case T_SHORT:
5931       assert(VM_Version::supports_avx512_bitalg(), "");
5932       evpopcntw(dst, mask, src, merge, vec_enc);
5933       break;
5934     case T_BYTE:
5935     case T_BOOLEAN:
5936       assert(VM_Version::supports_avx512_bitalg(), "");
5937       evpopcntb(dst, mask, src, merge, vec_enc);
5938       break;
5939     default:
5940       fatal("Unsupported type %s", type2name(bt));
5941       break;
5942   }
5943 }
5944 
5945 // Bit reversal algorithm first reverses the bits of each byte followed by
5946 // a byte level reversal for multi-byte primitive types (short/int/long).
5947 // Algorithm performs a lookup table access to get reverse bit sequence
5948 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5949 // is obtained by swapping the reverse bit sequences of upper and lower
5950 // nibble of a byte.
// See the algorithm description above. xtmp1, xtmp2 and rtmp are
// clobbered; dst must be distinct from src.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);       // isolate lower nibbles
    vpshufb(dst, xtmp1, dst, vec_enc);       // LUT lookup
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);      // isolate upper nibbles
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 path: same LUT scheme as above with VEX-encoded logicals.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6008 
// Bit-reverse each byte with a single GF(2^8) affine transform (GFNI),
// then byte-reverse the element for multi-byte types. mask holds the
// 8x8 bit-matrix operand broadcast to every qword; xtmp is clobbered.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6020 
// Swap adjacent nbits-wide bit groups selected by bitmask within each
// qword: dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits).
// bitmask is a 32-bit pattern replicated across the vector.
// xtmp1 and rtmp are clobbered.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);    // selected groups, shifted up
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);   // remaining groups, shifted down
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
6030 
// Reverse the byte order of each element of type bt using EVEX rotates
// plus an 8-bit group swap; used by the AVX_512bit path of
// vector_reverse_bit. xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);  // then swap words within each dword
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte elements: byte reversal is the identity; just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6060 
// Reverse the byte order of each element of type bt. T_BYTE is a plain
// copy; wider types shuffle bytes with a precomputed permutation mask
// loaded from the stub area.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Byte reversal of single-byte elements is the identity; just copy.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6089 
// Vector count-leading-zeros on AVX512CD. INT/LONG use vplzcnt directly.
// SHORT widens each word into a dword whose high half holds the word and
// whose low half is all ones, so the dword lzcnt equals the word lzcnt,
// then packs the counts back. BYTE uses the 4-bit lookup-table scheme
// described inline. xtmp1-3, ktmp and rtmp are clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);  // xtmp1 = all ones
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);          // dwords: src word in high half, ones low
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);   // dword lzcnt == word lzcnt
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);             // pack dword counts back to words
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);          // bytes whose high nibble is zero
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);   // add T1 for those bytes only
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6134 
// Per-byte leading-zero count for AVX/AVX2 via two 4-bit table lookups:
// the result is LUT[high nibble], plus LUT[low nibble] when the high
// nibble is zero. xtmp1 is left all-zero on return (relied upon by
// vector_count_leading_zeros_short_avx); xtmp2, xtmp3 and rtmp are
// clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);      // bytes whose high nibble is zero
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);  // pick the sum only for those bytes
}
6154 
// Per-word leading-zero count built on the per-byte counts: a word's
// count is its upper byte's count, plus its lower byte's count when the
// upper byte is zero. xtmp1-3 and rtmp are clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);   // words whose upper byte is zero
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);       // upper byte now = low count + high count
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);             // extract the per-word count
}
6168 
// Per-int leading zero count for AVX targets without AVX512CD, derived from
// the exponent of the int-to-float conversion. dst receives the counts;
// xtmp1-xtmp3 are clobbered (xtmp1 ends up holding -1 in all lanes).
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // xtmp2 = 0xFF in every lane (0xFFFFFFFF >>> 24).
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // xtmp2 = 127 in every lane (0xFFFFFFFF >>> 25).
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in every lane (0xFFFFFFFF >>> 27).
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6208 
// Per-long leading zero count for AVX targets, composed from the 32-bit
// variant applied to both halves of each long. rtmp is not used by this
// routine; xtmp1-xtmp3 are clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6230 
6231 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6232                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6233                                                        Register rtmp, int vec_enc) {
6234   assert(is_integral_type(bt), "unexpected type");
6235   assert(vec_enc < Assembler::AVX_512bit, "");
6236   switch(bt) {
6237     case T_LONG:
6238       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6239       break;
6240     case T_INT:
6241       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6242       break;
6243     case T_SHORT:
6244       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6245       break;
6246     case T_BYTE:
6247       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6248       break;
6249     default:
6250       fatal("Unsupported type %s", type2name(bt));
6251       break;
6252   }
6253 }
6254 
6255 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6256   switch(bt) {
6257     case T_BYTE:
6258       vpsubb(dst, src1, src2, vec_enc);
6259       break;
6260     case T_SHORT:
6261       vpsubw(dst, src1, src2, vec_enc);
6262       break;
6263     case T_INT:
6264       vpsubd(dst, src1, src2, vec_enc);
6265       break;
6266     case T_LONG:
6267       vpsubq(dst, src1, src2, vec_enc);
6268       break;
6269     default:
6270       fatal("Unsupported type %s", type2name(bt));
6271       break;
6272   }
6273 }
6274 
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// Per-lane trailing zero count for AVX512 targets: isolate the trailing-zero
// region as (src - 1) & ~src, count its leading zeros, and subtract from the
// lane bit width. Clobbers xtmp1-xtmp4, ktmp and rtmp.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp4 = -1 in all lanes (ternlog imm 0xff sets every bit).
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = (src - 1) & ~src (ternlog imm 0x40 is A & B & ~C with A = B = xtmp4,
  // C = src), leaving exactly the trailing-zero bits of src set.
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  // dst = CLZ((src - 1) & ~src)
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = lane bit width - CLZ = CTZ
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6293 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// Per-lane trailing zero count for AVX2 targets: x | -x keeps the lowest set
// bit and everything above it, so its popcount is width - ctz.
// Clobbers xtmp1-xtmp3 and rtmp.
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src = -src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = -src | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  // dst = POPC(src | -src)
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // dst = lane bit width - POPC = CTZ
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6309 
// Unsigned 32-bit division: rax = rax /u divisor. Clobbers rdx and flags.
// When the divisor is negative as a signed int (i.e. >= 2^31 unsigned) the
// unsigned quotient can only be 0 or 1, so it is computed branch-free
// instead of issuing the expensive divl.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware unsigned divide of rdx:rax (rdx zeroed).
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~rdx & rax in one instruction (BMI1 andn).
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}
6333 
// Unsigned 32-bit remainder: rdx = rax %u divisor. Clobbers rax and flags.
// For divisors that are negative as signed ints the remainder is either the
// dividend itself or dividend - divisor, computed branch-free.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: divl leaves the remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~rax & rdx in one instruction (BMI1 andn).
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift produces 0 or -1; mask selects divisor when quotient is 1.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6359 
// Combined unsigned 32-bit divide and modulus: on return rax holds the
// quotient and rdx the remainder. Clobbers tmp and flags.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: divl leaves quotient in rax and remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // tmp keeps the masked value so both shift variants can be applied.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6390 
// Reverse the bit order of a 32-bit value (Integer.reverse semantics): bits
// are reversed within each byte, then the final bswapl reverses byte order.
// Clobbers rtmp; the GFNI path also clobbers xtmp1/xtmp2.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine matrix 0x8040201008040201 reverses the bits within each byte
    // (matching the shift/mask path below).
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bytes now bit-reversed in place; reverse byte order to finish.
  bswapl(dst);
}
6429 
// Reverse the bit order of a 64-bit value (Long.reverse semantics): bits are
// reversed within each byte, then the final bswapq reverses byte order.
// Clobbers rtmp1; the non-GFNI path also clobbers rtmp2, the GFNI path
// clobbers xtmp1/xtmp2.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine matrix 0x8040201008040201 reverses the bits within each byte
    // (matching the shift/mask path below).
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    // Complement of the mask selects the other bit of each pair.
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bytes now bit-reversed in place; reverse byte order to finish.
  bswapq(dst);
}
6474 
// Unsigned 64-bit division: rax = rax /u divisor. Clobbers rdx and flags.
// When the divisor is negative as a signed long (>= 2^63 unsigned) the
// quotient can only be 0 or 1, computed branch-free instead of divq.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: hardware unsigned divide of rdx:rax (rdx zeroed).
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~rdx & rax in one instruction (BMI1 andn).
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}
6498 
// Unsigned 64-bit remainder: rdx = rax %u divisor. Clobbers rax and flags.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: divq leaves the remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~rax & rdx in one instruction (BMI1 andn).
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Arithmetic shift produces 0 or -1; mask selects divisor when quotient is 1.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6524 
// Combined unsigned 64-bit divide and modulus: on return rax holds the
// quotient and rdx the remainder. Clobbers tmp and flags.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: divq leaves quotient in rax and remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // tmp keeps the masked value so both shift variants can be applied.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6554 
// Cross-lane byte rearrangement for AVX512BW targets. vpshufb only shuffles
// within 128-bit lanes, so each 128-bit lane of src is broadcast across the
// vector in turn; an opmask selects the shuffle indices belonging to that
// lane ([16*i, 16*(i+1))) and the shuffled bytes are merged into dst.
// Clobbers xtmp1-xtmp3, rtmp and ktmp.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 per byte (16 << 1).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 per byte (16 + 32).
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 per byte (32 << 1).
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6600 
6601 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6602                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6603   if (vlen_enc == AVX_128bit) {
6604     vpermilps(dst, src, shuffle, vlen_enc);
6605   } else if (bt == T_INT) {
6606     vpermd(dst, shuffle, src, vlen_enc);
6607   } else {
6608     assert(bt == T_FLOAT, "");
6609     vpermps(dst, shuffle, src, vlen_enc);
6610   }
6611 }
6612 
6613 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6614   switch(opcode) {
6615     case Op_AddHF: vaddsh(dst, src1, src2); break;
6616     case Op_SubHF: vsubsh(dst, src1, src2); break;
6617     case Op_MulHF: vmulsh(dst, src1, src2); break;
6618     case Op_DivHF: vdivsh(dst, src1, src2); break;
6619     default: assert(false, "%s", NodeClassNames[opcode]); break;
6620   }
6621 }
6622 
6623 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6624   switch(elem_bt) {
6625     case T_BYTE:
6626       if (ideal_opc == Op_SaturatingAddV) {
6627         vpaddsb(dst, src1, src2, vlen_enc);
6628       } else {
6629         assert(ideal_opc == Op_SaturatingSubV, "");
6630         vpsubsb(dst, src1, src2, vlen_enc);
6631       }
6632       break;
6633     case T_SHORT:
6634       if (ideal_opc == Op_SaturatingAddV) {
6635         vpaddsw(dst, src1, src2, vlen_enc);
6636       } else {
6637         assert(ideal_opc == Op_SaturatingSubV, "");
6638         vpsubsw(dst, src1, src2, vlen_enc);
6639       }
6640       break;
6641     default:
6642       fatal("Unsupported type %s", type2name(elem_bt));
6643       break;
6644   }
6645 }
6646 
6647 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6648   switch(elem_bt) {
6649     case T_BYTE:
6650       if (ideal_opc == Op_SaturatingAddV) {
6651         vpaddusb(dst, src1, src2, vlen_enc);
6652       } else {
6653         assert(ideal_opc == Op_SaturatingSubV, "");
6654         vpsubusb(dst, src1, src2, vlen_enc);
6655       }
6656       break;
6657     case T_SHORT:
6658       if (ideal_opc == Op_SaturatingAddV) {
6659         vpaddusw(dst, src1, src2, vlen_enc);
6660       } else {
6661         assert(ideal_opc == Op_SaturatingSubV, "");
6662         vpsubusw(dst, src1, src2, vlen_enc);
6663       }
6664       break;
6665     default:
6666       fatal("Unsupported type %s", type2name(elem_bt));
6667       break;
6668   }
6669 }
6670 
// Unsigned saturating subtraction for int/long lanes on AVX512 targets:
// underflowing lanes (src1 <u src2) are clamped to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // NOTE(review): the arguments here are (src2, src1, lt), which reads as
  // src2 <u src1 — the complement of the mask described above (minus equal
  // lanes, whose difference is zero anyway). Verify against evmasked_op's
  // masking semantics; the net effect is the intended clamp-to-zero.
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6679 
// Unsigned saturating subtraction for int/long lanes on AVX targets without
// opmask registers: underflowing lanes are clamped to zero via byte blend.
// Clobbers xtmp1 and xtmp2.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  // xtmp1 = MIN_VALUE per lane; add it to both inputs to bias them.
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = biased(src2) > biased(src1), i.e. the underflow mask.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6696 
// Unsigned saturating addition for int/long lanes on AVX512 targets:
// overflowing lanes are clamped to the unsigned maximum (all ones).
// Clobbers xtmp1, xtmp2 and ktmp.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1 (ternlog imm 0xff sets every bit)
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6712 
6713 //
6714 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6715 // unsigned addition operation.
6716 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6717 //
6718 // We empirically determined its semantic equivalence to following reduced expression
6719 //    overflow_mask =  (a + b) <u (a | b)
6720 //
6721 // and also verified it though Alive2 solver.
6722 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6723 //
6724 
// Unsigned saturating addition for int/long lanes on AVX targets without
// opmask registers (see the overflow-mask derivation above): overflowing
// lanes are clamped to all ones. Clobbers xtmp1-xtmp3.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE per lane; xtmp1 is left holding -1 (all ones), which
  // doubles as the saturated result in the final blend.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<1> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Overflowed lanes take -1 (unsigned max) from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6746 
// Set ktmp to the per-lane sign bits of the long vector src, emulating
// evpmovq2m on EVEX targets that lack AVX512DQ. Clobbers xtmp1; xtmp2 is
// filled with -1 unless the caller guarantees it already holds -1
// (xtmp2_hold_M1).
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in all lanes (ternlog imm 0xff sets every bit).
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign-extended lanes (0 or -1); mask = lanes equal to -1.
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6760 
// Set ktmp to the per-lane sign bits of the int vector src, emulating
// evpmovd2m on EVEX targets that lack AVX512DQ. Clobbers xtmp1; xtmp2 is
// filled with -1 unless the caller guarantees it already holds -1
// (xtmp2_hold_M1).
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in all lanes (ternlog imm 0xff sets every bit).
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign-extended lanes (0 or -1); mask = lanes equal to -1.
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6774 
6775 
// Fill each int/long lane of dst with its sign bit replicated (0 or -1).
// For longs without EVEX there is no 64-bit arithmetic shift, so the
// 32-bit shift result's high dword is replicated across each quadword
// via vpshufd (imm 0xF5 selects dwords 1,1,3,3).
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6789 
// Broadcast the maximum signed value (0x7FFF...FF) of the element type into
// dst, derived as all-ones logically shifted right by one. When
// compute_allones is true 'allones' is filled with -1 first; otherwise the
// caller must have already put -1 in every lane of 'allones'.
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      // ternlog imm 0xff sets every bit.
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6805 
// Broadcast the minimum signed value (0x8000...00) of the element type into
// dst, derived as all-ones shifted left to leave only the sign bit. When
// compute_allones is true 'allones' is filled with -1 first; otherwise the
// caller must have already put -1 in every lane of 'allones'.
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      // ternlog imm 0xff sets every bit.
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6821 
6822 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6823                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6824   switch(elem_bt) {
6825     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6826     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6827     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6828     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6829     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6830   }
6831 }
6832 
6833 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6834   switch(elem_bt) {
6835     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6836     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6837     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6838     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6839     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6840   }
6841 }
6842 
6843 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6844                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6845   if (elem_bt == T_LONG) {
6846     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6847   } else {
6848     assert(elem_bt == T_INT, "");
6849     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6850   }
6851 }
6852 
// Signed saturating add/sub for int/long lanes on AVX512 targets: a lane
// that overflows is replaced by MIN_VALUE or MAX_VALUE depending on the sign
// of the first input. Clobbers xtmp1, xtmp2 and both opmask temporaries.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = MAX_VALUE, xtmp1 = MIN_VALUE per lane.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6895 
6896 
// Emits AVX (non-AVX512) code for signed saturating vector add/sub over T_INT or
// T_LONG lanes. The wrap-around result is computed first; lanes that overflowed
// are then replaced with the saturating value (MAX for positive overflow, MIN for
// negative overflow) using byte-blends driven by sign masks.
// xtmp1-xtmp4 are clobbered as scratch registers; vlen_enc selects vector length.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result's polarity does not match the (equal) polarity of the inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not match the first input's polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask (all-ones lanes where overflow occurred).
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = -1 in every lane, used as the seed for the MAX/MIN value generators below.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input's polarity mask:
  // negative src1 lanes saturate toward MIN, non-negative lanes toward MAX.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using the overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6937 
6938 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6939   switch(elem_bt) {
6940     case T_BYTE:
6941       if (ideal_opc == Op_SaturatingAddV) {
6942         vpaddsb(dst, src1, src2, vlen_enc);
6943       } else {
6944         assert(ideal_opc == Op_SaturatingSubV, "");
6945         vpsubsb(dst, src1, src2, vlen_enc);
6946       }
6947       break;
6948     case T_SHORT:
6949       if (ideal_opc == Op_SaturatingAddV) {
6950         vpaddsw(dst, src1, src2, vlen_enc);
6951       } else {
6952         assert(ideal_opc == Op_SaturatingSubV, "");
6953         vpsubsw(dst, src1, src2, vlen_enc);
6954       }
6955       break;
6956     default:
6957       fatal("Unsupported type %s", type2name(elem_bt));
6958       break;
6959   }
6960 }
6961 
6962 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6963   switch(elem_bt) {
6964     case T_BYTE:
6965       if (ideal_opc == Op_SaturatingAddV) {
6966         vpaddusb(dst, src1, src2, vlen_enc);
6967       } else {
6968         assert(ideal_opc == Op_SaturatingSubV, "");
6969         vpsubusb(dst, src1, src2, vlen_enc);
6970       }
6971       break;
6972     case T_SHORT:
6973       if (ideal_opc == Op_SaturatingAddV) {
6974         vpaddusw(dst, src1, src2, vlen_enc);
6975       } else {
6976         assert(ideal_opc == Op_SaturatingSubV, "");
6977         vpsubusw(dst, src1, src2, vlen_enc);
6978       }
6979       break;
6980     default:
6981       fatal("Unsupported type %s", type2name(elem_bt));
6982       break;
6983   }
6984 }
6985 
6986 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6987                                                      XMMRegister src2, int vlen_enc) {
6988   switch(elem_bt) {
6989     case T_BYTE:
6990       evpermi2b(dst, src1, src2, vlen_enc);
6991       break;
6992     case T_SHORT:
6993       evpermi2w(dst, src1, src2, vlen_enc);
6994       break;
6995     case T_INT:
6996       evpermi2d(dst, src1, src2, vlen_enc);
6997       break;
6998     case T_LONG:
6999       evpermi2q(dst, src1, src2, vlen_enc);
7000       break;
7001     case T_FLOAT:
7002       evpermi2ps(dst, src1, src2, vlen_enc);
7003       break;
7004     case T_DOUBLE:
7005       evpermi2pd(dst, src1, src2, vlen_enc);
7006       break;
7007     default:
7008       fatal("Unsupported type %s", type2name(elem_bt));
7009       break;
7010   }
7011 }
7012 
7013 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7014   if (is_unsigned) {
7015     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7016   } else {
7017     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7018   }
7019 }
7020 
7021 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7022   if (is_unsigned) {
7023     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7024   } else {
7025     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7026   }
7027 }
7028 
7029 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7030   switch(opcode) {
7031     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7032     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7033     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7034     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7035     default: assert(false, "%s", NodeClassNames[opcode]); break;
7036   }
7037 }
7038 
7039 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7040   switch(opcode) {
7041     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7042     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7043     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7044     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7045     default: assert(false, "%s", NodeClassNames[opcode]); break;
7046   }
7047 }
7048 
// Scalar FP16 max/min: delegates to the vector implementation using a 128-bit
// vector length encoding (only the lowest lane is meaningful to the caller).
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7053 
// Emits AVX512-FP16 vector max/min with Java semantics: NaN inputs propagate to
// the result, and +/-0.0 are distinguished (per the comments below, inputs are
// swapped via sign-bit masks so the raw evmaxph/evminph tie-breaking rules give
// the correct result). ktmp, xtmp1 and xtmp2 are clobbered as scratch registers.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a +ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value;
    // in case the second operand holds a NaN value then as per above semantics
    // the result is same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value;
    // in case the second operand holds a NaN value then as per above semantics
    // the result is same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
--- EOF ---