1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "gc/shared/barrierSet.hpp"
  28 #include "gc/shared/barrierSetAssembler.hpp"
  29 #include "oops/methodData.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/globals.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  // Emits the frame-setup prologue for a C2 compiled method.
  //
  // framesize       - total frame size in bytes (includes the return-address
  //                   slot on entry; must be stack-aligned)
  // stack_bang_size - number of bytes to probe for stack overflow; <= 0 means
  //                   the caller has already banged the stack for us
  // fp_mode_24b     - not used in this body (kept for interface parity;
  //                   presumably the legacy x87 precision mode flag -- confirm)
  // is_stub         - when true, skip emitting the nmethod entry barrier
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang requested: allocate the whole frame first, then store rbp
    // into its reserved slot instead of pushing it.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        // Point rbp at the saved-rbp slot, matching the push-based layout above.
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check that rsp has the expected misalignment-by-one-word that a
    // pending return address implies; rax is saved/restored around the check.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
 133 
 134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 135   switch (vlen_in_bytes) {
 136     case  4: // fall-through
 137     case  8: // fall-through
 138     case 16: return Assembler::AVX_128bit;
 139     case 32: return Assembler::AVX_256bit;
 140     case 64: return Assembler::AVX_512bit;
 141 
 142     default: {
 143       ShouldNotReachHere();
 144       return Assembler::AVX_NoVec;
 145     }
 146   }
 147 }
 148 
 149 // fast_lock and fast_unlock used by C2
 150 
 151 // Because the transitions from emitted code to the runtime
 152 // monitorenter/exit helper stubs are so slow it's critical that
 153 // we inline both the stack-locking fast path and the inflated fast path.
 154 //
 155 // See also: cmpFastLock and cmpFastUnlock.
 156 //
 157 // What follows is a specialized inline transliteration of the code
 158 // in enter() and exit(). If we're concerned about I$ bloat another
 159 // option would be to emit TrySlowEnter and TrySlowExit methods
 160 // at startup-time.  These methods would accept arguments as
 161 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 162 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 164 // In practice, however, the # of lock sites is bounded and is usually small.
 165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 169 //
 170 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 171 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 172 // to those specialized methods.  That'd give us a mostly platform-independent
 173 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 175 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 176 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 177 // (b) explicit barriers or fence operations.
 178 //
 179 // TODO:
 180 //
 181 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 182 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 183 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 184 //    the lock operators would typically be faster than reifying Self.
 185 //
 186 // *  Ideally I'd define the primitives as:
 187 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 188 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 189 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 190 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 191 //    Furthermore the register assignments are overconstrained, possibly resulting in
 192 //    sub-optimal code near the synchronization site.
 193 //
 194 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 195 //    Alternately, use a better sp-proximity test.
 196 //
 197 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 198 //    Either one is sufficient to uniquely identify a thread.
 199 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 200 //
 201 // *  Intrinsify notify() and notifyAll() for the common cases where the
 202 //    object is locked by the calling thread but the waitlist is empty.
 203 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 204 //
 205 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 206 //    But beware of excessive branch density on AMD Opterons.
 207 //
 208 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 209 //    or failure of the fast path.  If the fast path fails then we pass
 210 //    control to the slow path, typically in C.  In fast_lock and
 211 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 212 //    will emit a conditional branch immediately after the node.
 213 //    So we have branches to branches and lots of ICC.ZF games.
 214 //    Instead, it might be better to have C2 pass a "FailureLabel"
 215 //    into fast_lock and fast_unlock.  In the case of success, control
 216 //    will drop through the node.  ICC.ZF is undefined at exit.
 217 //    In the case of failure, the node will branch directly to the
 218 //    FailureLabel
 219 
 220 
 221 // obj: object to lock
 222 // box: on-stack box address -- KILLED
 223 // rax: tmp -- KILLED
 224 // t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  // Inline fast path for C2 monitorenter (see cmpFastLock in the .ad file).
  // Tries lock-stack (stack) locking first, then the inflated-monitor fast
  // path. On exit ZF == 1 means the lock was acquired; slow_path is reached
  // with ZF == 0 so C2's conditional branch transfers control to the runtime.
  //
  // obj     - object to lock
  // box     - on-stack BasicLock box address -- KILLED
  // rax_reg - must be rax (implicit cmpxchg operand) -- KILLED
  // t       - temp -- KILLED
  // thread  - current JavaThread
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Synchronizing on a value-based class is diagnosed in the runtime;
    // divert such objects to the slow path.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top (byte offset of the lock-stack top within the thread).
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive (obj already at the top of the lock-stack).
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax holds the expected (unlocked) mark; t holds the desired (locked) mark.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // The mark word itself carries the (tagged) ObjectMonitor*.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      // Probe the first few entries straight-line before falling into a loop.
      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    // Fold the monitor tag bits into the displacement rather than untagging
    // the register (tag is only present when the mark word holds the pointer).
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive (CAS failed; rax now holds the observed owner).
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 385 
 386 // obj: object to lock
 387 // rax: tmp -- KILLED
 388 // t  : tmp - cannot be obj nor rax -- KILLED
 389 //
 390 // Some commentary on balanced locking:
 391 //
 392 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 393 // Methods that don't have provably balanced locking are forced to run in the
 394 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 395 // The interpreter provides two properties:
 396 // I1:  At return-time the interpreter automatically and quietly unlocks any
 397 //      objects acquired in the current activation (frame).  Recall that the
 398 //      interpreter maintains an on-stack list of locks currently held by
 399 //      a frame.
 400 // I2:  If a method attempts to unlock an object that is not held by the
 401 //      frame the interpreter throws IMSX.
 402 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 404 // B() doesn't have provably balanced locking so it runs in the interpreter.
 405 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 406 // is still locked by A().
 407 //
 408 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface
 409 // Specification" states that an object locked by JNI's MonitorEnter should not be
 410 // unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
 411 // specify what will occur if a program engages in such mixed-mode locking, however.
 412 // Arguably given that the spec legislates the JNI case as undefined our implementation
 413 // could reasonably *avoid* checking owner in fast_unlock().
 414 // In the interest of performance we elide m->Owner==Self check in unlock.
 415 // A perfectly viable alternative is to elide the owner check except when
 416 // Xcheck:jni is enabled.
 417 
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  // Inline fast path for C2 monitorexit (see cmpFastUnlock in the .ad file).
  // Mirrors fast_lock: pops the lock-stack for stack-locked objects, else
  // attempts the inflated-monitor fast exit. On exit ZF == 1 means the unlock
  // succeeded; slow_path is reached with ZF == 0 so control transfers to the
  // runtime helper.
  //
  // obj     - object to unlock
  // reg_rax - must be rax (implicit cmpxchg operand) -- KILLED
  // t       - temp; cannot be obj nor rax -- KILLED
  // thread  - current JavaThread
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  // mark/monitor alias in t (and top may alias rax); each phase reloads the
  // value it needs, so the aliasing is safe by construction below.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Out-of-line stub handles the uncommon "re-push obj and go slow" case;
    // during scratch (size-measuring) emission we bind a dummy label instead.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive (obj occurs again just below the popped slot).
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // rax holds the expected (locked) mark; t holds the desired (unlocked) mark.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failure: restore the lock-stack entry in the stub, then go slow.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj appears nowhere on the lock-stack and that the
    // mark word really is a monitor before proceeding.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // The mark word itself carries the (tagged) ObjectMonitor*.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Fold the monitor tag bits into the displacement rather than untagging
    // the register (tag is only present when the mark word holds the pointer).
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the tag bits to store the raw ObjectMonitor*.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
 579 
// Out-of-line failure routine for verify_int_in_range(): report the violating
// CastII node index, value, and expected range, then abort the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
 583 
 584 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
 585   const int framesize = Compile::current()->output()->frame_size_in_bytes();
 586   masm->movptr(dst, rsp);
 587   if (framesize > 2 * wordSize) {
 588     masm->addptr(dst, framesize - 2 * wordSize);
 589   }
 590 }
 591 
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  // Ensure rbp holds the frame pointer before calling into the runtime
  // (used by the verify_*_in_range abort paths). With PreserveFramePointer
  // rbp is already maintained; otherwise it is recomputed from rsp.
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    // rbp is a scratch register here; rebuild the frame pointer into it.
    reconstruct_frame_pointer_helper(this, rbp);
  }
}
 608 
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  // Emit a debug check that val lies within the CastII type range
  // [t->_lo, t->_hi]; on violation, call abort_verify_int_in_range and halt.
  // No code is emitted for the unconstrained TypeInt::INT.
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only emit the bound checks that can actually fail.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // NOTE(review): argument registers are written in order, so this assumes
  // val is not c_rarg0 (it would be clobbered before being copied) -- confirm
  // against the register constraints at the matching .ad instruct.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
 642 
// Out-of-line failure routine for verify_long_in_range(): report the violating
// CastLL node index, value, and expected range, then abort the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
 646 
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  // Emit a debug check that val lies within the CastLL type range
  // [t->_lo, t->_hi]; on violation, call abort_verify_long_in_range and halt.
  // tmp is only used to materialize bounds that do not fit in a simm32.
  // No code is emitted for the unconstrained TypeLong::LONG.
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound, using an immediate when it fits.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only emit the bound checks that can actually fail.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // NOTE(review): argument registers are written in order, so this assumes
  // val is not c_rarg0 (it would be clobbered before being copied) -- confirm
  // against the register constraints at the matching .ad instruct.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
 689 
 690 //-------------------------------------------------------------------------------------------
 691 // Generic instructions support for use in .ad files C2 code generation
 692 
 693 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 694   if (dst != src) {
 695     movdqu(dst, src);
 696   }
 697   if (opcode == Op_AbsVD) {
 698     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 699   } else {
 700     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 701     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 702   }
 703 }
 704 
 705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 706   if (opcode == Op_AbsVD) {
 707     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 708   } else {
 709     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 710     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 711   }
 712 }
 713 
 714 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 715   if (dst != src) {
 716     movdqu(dst, src);
 717   }
 718   if (opcode == Op_AbsVF) {
 719     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 720   } else {
 721     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 722     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 723   }
 724 }
 725 
 726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 727   if (opcode == Op_AbsVF) {
 728     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 729   } else {
 730     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 731     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 732   }
 733 }
 734 
 735 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 736   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 737   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 738 
 739   if (opcode == Op_MinV) {
 740     if (elem_bt == T_BYTE) {
 741       pminsb(dst, src);
 742     } else if (elem_bt == T_SHORT) {
 743       pminsw(dst, src);
 744     } else if (elem_bt == T_INT) {
 745       pminsd(dst, src);
 746     } else {
 747       assert(elem_bt == T_LONG, "required");
 748       assert(tmp == xmm0, "required");
 749       assert_different_registers(dst, src, tmp);
 750       movdqu(xmm0, dst);
 751       pcmpgtq(xmm0, src);
 752       blendvpd(dst, src);  // xmm0 as mask
 753     }
 754   } else { // opcode == Op_MaxV
 755     if (elem_bt == T_BYTE) {
 756       pmaxsb(dst, src);
 757     } else if (elem_bt == T_SHORT) {
 758       pmaxsw(dst, src);
 759     } else if (elem_bt == T_INT) {
 760       pmaxsd(dst, src);
 761     } else {
 762       assert(elem_bt == T_LONG, "required");
 763       assert(tmp == xmm0, "required");
 764       assert_different_registers(dst, src, tmp);
 765       movdqu(xmm0, src);
 766       pcmpgtq(xmm0, dst);
 767       blendvpd(dst, src);  // xmm0 as mask
 768     }
 769   }
 770 }
 771 
 772 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 773                                   XMMRegister src1, Address src2, int vlen_enc) {
 774   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 775   if (opcode == Op_UMinV) {
 776     switch(elem_bt) {
 777       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 778       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 779       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 780       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 781       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 782     }
 783   } else {
 784     assert(opcode == Op_UMaxV, "required");
 785     switch(elem_bt) {
 786       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 787       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 788       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 789       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 790       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 791     }
 792   }
 793 }
 794 
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Unsigned min/max over 64-bit lanes. When the EVEX-encoded unsigned
  // quadword min/max cannot be used at the requested vector length, it is
  // emulated with a signed compare after biasing both operands by 2^63.
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // Flipping the sign bit (adding 1 << 63) of both operands turns an
    // unsigned comparison into an equivalent signed one.
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
 825 
 826 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
 827                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
 828   assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
 829   if (opcode == Op_UMinV) {
 830     switch(elem_bt) {
 831       case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
 832       case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
 833       case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
 834       case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
 835       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 836     }
 837   } else {
 838     assert(opcode == Op_UMaxV, "required");
 839     switch(elem_bt) {
 840       case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
 841       case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
 842       case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
 843       case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
 844       default: fatal("Unsupported type %s", type2name(elem_bt)); break;
 845     }
 846   }
 847 }
 848 
 849 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 850                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 851                                  int vlen_enc) {
 852   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 853 
 854   if (opcode == Op_MinV) {
 855     if (elem_bt == T_BYTE) {
 856       vpminsb(dst, src1, src2, vlen_enc);
 857     } else if (elem_bt == T_SHORT) {
 858       vpminsw(dst, src1, src2, vlen_enc);
 859     } else if (elem_bt == T_INT) {
 860       vpminsd(dst, src1, src2, vlen_enc);
 861     } else {
 862       assert(elem_bt == T_LONG, "required");
 863       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 864         vpminsq(dst, src1, src2, vlen_enc);
 865       } else {
 866         assert_different_registers(dst, src1, src2);
 867         vpcmpgtq(dst, src1, src2, vlen_enc);
 868         vblendvpd(dst, src1, src2, dst, vlen_enc);
 869       }
 870     }
 871   } else { // opcode == Op_MaxV
 872     if (elem_bt == T_BYTE) {
 873       vpmaxsb(dst, src1, src2, vlen_enc);
 874     } else if (elem_bt == T_SHORT) {
 875       vpmaxsw(dst, src1, src2, vlen_enc);
 876     } else if (elem_bt == T_INT) {
 877       vpmaxsd(dst, src1, src2, vlen_enc);
 878     } else {
 879       assert(elem_bt == T_LONG, "required");
 880       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 881         vpmaxsq(dst, src1, src2, vlen_enc);
 882       } else {
 883         assert_different_registers(dst, src1, src2);
 884         vpcmpgtq(dst, src1, src2, vlen_enc);
 885         vblendvpd(dst, src2, src1, dst, vlen_enc);
 886       }
 887     }
 888   }
 889 }
 890 
 891 // Float/Double min max
 892 
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  // Float/double vector min/max with Java semantics (see note below).
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the float/double and min/max variants of the blend/minmax/compare
  // emitters once, so the emission sequence below is shared by all four cases.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  // Blend mask: the sign of 'a' for min, the sign of 'b' for max
  // (see the pseudo code above).
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // When E-core opts are enabled, materialize the sign mask explicitly up
  // front (all-ones lanes for negative elements) instead of letting the
  // blends key off the sign bit implicitly.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // NOTE(review): relies on the arithmetic dword shift count saturating,
    // so 32 fills each lane with its sign bit — confirm against vpsrad spec.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // Compare 0 > mask, which also yields all-ones for negative lanes.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);   // atmp per pseudo code above
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);    // btmp per pseudo code above
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);      // scratch = lanes where atmp is NaN
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); // take atmp on NaN lanes
}
 980 
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  // AVX512 counterpart of vminmax_fp above: same algorithm (route operands by
  // sign so vmin/vmax resolve +/-0.0 per Java rules, then overwrite NaN lanes
  // from atmp), implemented with an opmask register instead of xmm blends.
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);                                // ktmp = per-lane sign bit of a
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);               // route a/b by sign (see vminmax_fp pseudo code)
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); // ktmp = lanes where atmp is NaN
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);                 // propagate NaN lanes from atmp
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);                                // max keys off the sign of b
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1027 
1028 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1029                                    XMMRegister src1, XMMRegister src2, int vlen_enc) {
1030   assert(opc == Op_MinV || opc == Op_MinReductionV ||
1031          opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1032 
1033   int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1034                                                          : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1035   if (elem_bt == T_FLOAT) {
1036     evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1037   } else {
1038     assert(elem_bt == T_DOUBLE, "");
1039     evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1040   }
1041 }
1042 
1043 // Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  // Computes signum in-place in dst: the argument is returned unchanged for
  // +0.0/-0.0/NaN, otherwise +1.0 for positive and -1.0 for negative inputs.
  // 'zero' holds 0.0 and 'one' holds 1.0 of the matching width.
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load +1.0; the move does not affect flags, so the branch below still
    // tests the compare of dst against zero: 'above' (dst > 0.0) keeps +1.0.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Otherwise the input was negative: flip the sign bit to get -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same scheme as the float path, with double-width moves and masks.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1078 
1079 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1080   if (sign) {
1081     pmovsxbw(dst, src);
1082   } else {
1083     pmovzxbw(dst, src);
1084   }
1085 }
1086 
1087 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1088   if (sign) {
1089     vpmovsxbw(dst, src, vector_len);
1090   } else {
1091     vpmovzxbw(dst, src, vector_len);
1092   }
1093 }
1094 
1095 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1096   if (sign) {
1097     vpmovsxbd(dst, src, vector_len);
1098   } else {
1099     vpmovzxbd(dst, src, vector_len);
1100   }
1101 }
1102 
1103 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1104   if (sign) {
1105     vpmovsxwd(dst, src, vector_len);
1106   } else {
1107     vpmovzxwd(dst, src, vector_len);
1108   }
1109 }
1110 
1111 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1112                                      int shift, int vector_len) {
1113   if (opcode == Op_RotateLeftV) {
1114     if (etype == T_INT) {
1115       evprold(dst, src, shift, vector_len);
1116     } else {
1117       assert(etype == T_LONG, "expected type T_LONG");
1118       evprolq(dst, src, shift, vector_len);
1119     }
1120   } else {
1121     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1122     if (etype == T_INT) {
1123       evprord(dst, src, shift, vector_len);
1124     } else {
1125       assert(etype == T_LONG, "expected type T_LONG");
1126       evprorq(dst, src, shift, vector_len);
1127     }
1128   }
1129 }
1130 
1131 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1132                                      XMMRegister shift, int vector_len) {
1133   if (opcode == Op_RotateLeftV) {
1134     if (etype == T_INT) {
1135       evprolvd(dst, src, shift, vector_len);
1136     } else {
1137       assert(etype == T_LONG, "expected type T_LONG");
1138       evprolvq(dst, src, shift, vector_len);
1139     }
1140   } else {
1141     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1142     if (etype == T_INT) {
1143       evprorvd(dst, src, shift, vector_len);
1144     } else {
1145       assert(etype == T_LONG, "expected type T_LONG");
1146       evprorvq(dst, src, shift, vector_len);
1147     }
1148   }
1149 }
1150 
1151 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1152   if (opcode == Op_RShiftVI) {
1153     psrad(dst, shift);
1154   } else if (opcode == Op_LShiftVI) {
1155     pslld(dst, shift);
1156   } else {
1157     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1158     psrld(dst, shift);
1159   }
1160 }
1161 
1162 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1163   switch (opcode) {
1164     case Op_RShiftVI:  psrad(dst, shift); break;
1165     case Op_LShiftVI:  pslld(dst, shift); break;
1166     case Op_URShiftVI: psrld(dst, shift); break;
1167 
1168     default: assert(false, "%s", NodeClassNames[opcode]);
1169   }
1170 }
1171 
1172 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1173   if (opcode == Op_RShiftVI) {
1174     vpsrad(dst, nds, shift, vector_len);
1175   } else if (opcode == Op_LShiftVI) {
1176     vpslld(dst, nds, shift, vector_len);
1177   } else {
1178     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1179     vpsrld(dst, nds, shift, vector_len);
1180   }
1181 }
1182 
1183 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1184   switch (opcode) {
1185     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1186     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1187     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1188 
1189     default: assert(false, "%s", NodeClassNames[opcode]);
1190   }
1191 }
1192 
1193 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1194   switch (opcode) {
1195     case Op_RShiftVB:  // fall-through
1196     case Op_RShiftVS:  psraw(dst, shift); break;
1197 
1198     case Op_LShiftVB:  // fall-through
1199     case Op_LShiftVS:  psllw(dst, shift);   break;
1200 
1201     case Op_URShiftVS: // fall-through
1202     case Op_URShiftVB: psrlw(dst, shift);  break;
1203 
1204     default: assert(false, "%s", NodeClassNames[opcode]);
1205   }
1206 }
1207 
1208 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1209   switch (opcode) {
1210     case Op_RShiftVB:  // fall-through
1211     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1212 
1213     case Op_LShiftVB:  // fall-through
1214     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1215 
1216     case Op_URShiftVS: // fall-through
1217     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1218 
1219     default: assert(false, "%s", NodeClassNames[opcode]);
1220   }
1221 }
1222 
1223 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1224   switch (opcode) {
1225     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1226     case Op_LShiftVL:  psllq(dst, shift); break;
1227     case Op_URShiftVL: psrlq(dst, shift); break;
1228 
1229     default: assert(false, "%s", NodeClassNames[opcode]);
1230   }
1231 }
1232 
1233 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1234   if (opcode == Op_RShiftVL) {
1235     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1236   } else if (opcode == Op_LShiftVL) {
1237     psllq(dst, shift);
1238   } else {
1239     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1240     psrlq(dst, shift);
1241   }
1242 }
1243 
1244 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1245   switch (opcode) {
1246     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1247     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1248     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1249 
1250     default: assert(false, "%s", NodeClassNames[opcode]);
1251   }
1252 }
1253 
1254 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1255   if (opcode == Op_RShiftVL) {
1256     evpsraq(dst, nds, shift, vector_len);
1257   } else if (opcode == Op_LShiftVL) {
1258     vpsllq(dst, nds, shift, vector_len);
1259   } else {
1260     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1261     vpsrlq(dst, nds, shift, vector_len);
1262   }
1263 }
1264 
1265 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1266   switch (opcode) {
1267     case Op_RShiftVB:  // fall-through
1268     case Op_RShiftVS:  // fall-through
1269     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1270 
1271     case Op_LShiftVB:  // fall-through
1272     case Op_LShiftVS:  // fall-through
1273     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1274 
1275     case Op_URShiftVB: // fall-through
1276     case Op_URShiftVS: // fall-through
1277     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1278 
1279     default: assert(false, "%s", NodeClassNames[opcode]);
1280   }
1281 }
1282 
1283 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1284   switch (opcode) {
1285     case Op_RShiftVB:  // fall-through
1286     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1287 
1288     case Op_LShiftVB:  // fall-through
1289     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1290 
1291     case Op_URShiftVB: // fall-through
1292     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1293 
1294     default: assert(false, "%s", NodeClassNames[opcode]);
1295   }
1296 }
1297 
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  // Per-lane variable shift on quadword lanes. 'tmp' is only needed on the
  // AVX2 arithmetic-right-shift path; all other paths assert it is xnoreg.
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no variable arithmetic right shift for quadwords.
        // Emulate via logical shifts with the identity
        //   sra(x, s) == (srl(x, s) ^ m) - m,  where m = srl(sign_mask, s)
        // (assumes vector_long_sign_mask sets only the sign bit per lane).
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1330 
// Variable shift src by shift using vtmp as a TEMP giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");         // 128-bit input; widened form is 256-bit
  vextendbd(sign, dst, src, 1);                // widen bytes to dwords (sign- or zero-extend)
  vpmovzxbd(vtmp, shift, 1);                   // widen shift counts to dwords
  varshiftd(opcode, dst, dst, vtmp, 1);        // per-lane dword shift
  // Keep only the low byte of each dword before narrowing back.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);                // narrow dwords to the word result
}
1345 
// Variable shift src by shift using vtmp as a TEMP giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Operate at twice the vector length, since bytes are widened to words.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);   // widen bytes to words (sign- or zero-extend)
  vpmovzxbw(vtmp, shift, ext_vector_len);      // widen shift counts to words
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len); // per-lane word shift
  // Keep only the low byte of each word before narrowing back.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);     // narrow words back to bytes
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // vpackuswb packs within 128-bit lanes; the 0xD8 quadword permute
    // restores the overall element order.
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1366 
1367 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1368   switch(typ) {
1369     case T_BYTE:
1370       pinsrb(dst, val, idx);
1371       break;
1372     case T_SHORT:
1373       pinsrw(dst, val, idx);
1374       break;
1375     case T_INT:
1376       pinsrd(dst, val, idx);
1377       break;
1378     case T_LONG:
1379       pinsrq(dst, val, idx);
1380       break;
1381     default:
1382       assert(false,"Should not reach here.");
1383       break;
1384   }
1385 }
1386 
1387 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1388   switch(typ) {
1389     case T_BYTE:
1390       vpinsrb(dst, src, val, idx);
1391       break;
1392     case T_SHORT:
1393       vpinsrw(dst, src, val, idx);
1394       break;
1395     case T_INT:
1396       vpinsrd(dst, src, val, idx);
1397       break;
1398     case T_LONG:
1399       vpinsrq(dst, src, val, idx);
1400       break;
1401     default:
1402       assert(false,"Should not reach here.");
1403       break;
1404   }
1405 }
1406 
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  // Scalar emulation of a masked gather of one 64-bit slice (4 shorts or
  // 8 bytes). dst is pre-zeroed, so lanes with a clear mask bit stay zero.
  // mask_idx is advanced for every lane, including skipped ones, so
  // consecutive calls consume consecutive mask bits.
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                  // CF = mask bit at mask_idx
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4)); // load the i-th 32-bit index
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1437 
1438 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1439                                   Register base, Register idx_base,
1440                                   Register rtmp, int vlen_enc) {
1441   vpxor(dst, dst, dst, vlen_enc);
1442   if (elem_bt == T_SHORT) {
1443     for (int i = 0; i < 4; i++) {
1444       // dst[i] = src[idx_base[i]]
1445       movl(rtmp, Address(idx_base, i * 4));
1446       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1447     }
1448   } else {
1449     assert(elem_bt == T_BYTE, "");
1450     for (int i = 0; i < 8; i++) {
1451       // dst[i] = src[idx_base[i]]
1452       movl(rtmp, Address(idx_base, i * 4));
1453       pinsrb(dst, Address(base, rtmp), i);
1454     }
1455   }
1456 }
1457 
1458 /*
1459  * Gather using hybrid algorithm, first partially unroll scalar loop
1460  * to accumulate values from gather indices into a quad-word(64bit) slice.
1461  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1462  * permutation to place the slice into appropriate vector lane
1463  * locations in destination vector. Following pseudo code describes the
1464  * algorithm in detail:
1465  *
1466  * DST_VEC = ZERO_VEC
1467  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1468  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1469  * FOREACH_ITER:
1470  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1471  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1472  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1473  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1474  *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
1477  *
1478  */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  // See the algorithm description in the block comment above. 'mask == noreg'
  // selects the unmasked gather flavor; otherwise lanes with a clear mask bit
  // stay zero. 'length' counts remaining elements; each loop trip gathers one
  // 64-bit slice (8 bytes or 4 shorts).
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); // xtmp2 = 0 - (-1) = {1, 1, ...}
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance one 64-bit slice: 8 int indices (32 bytes) for T_BYTE,
    // 4 int indices (16 bytes) for T_SHORT.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1512 
1513 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1514   switch(typ) {
1515     case T_INT:
1516       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1517       break;
1518     case T_FLOAT:
1519       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1520       break;
1521     case T_LONG:
1522       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1523       break;
1524     case T_DOUBLE:
1525       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1526       break;
1527     default:
1528       assert(false,"Should not reach here.");
1529       break;
1530   }
1531 }
1532 
1533 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1534   switch(typ) {
1535     case T_INT:
1536       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1537       break;
1538     case T_FLOAT:
1539       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1540       break;
1541     case T_LONG:
1542       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1543       break;
1544     case T_DOUBLE:
1545       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1546       break;
1547     default:
1548       assert(false,"Should not reach here.");
1549       break;
1550   }
1551 }
1552 
// AVX-512 masked scatter: store the lanes of src to base + idx[i] * scale
// under the k-register write mask; dispatch on the element type.
void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
1572 
// Expand a byte-boolean vector (0/1 per element) in src into a full vector
// mask in dst: 0 -> all-zero lane, 1 -> all-ones lane of width elem_bt.
// Implemented as (0 - src) per byte, then sign-extension to element width.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);   // 0x00 stays 0x00, 0x01 becomes 0xFF
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // Legacy (non-EVEX) encodings cap the byte subtract at 256 bits.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */            break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1606 
// Turn a byte-boolean vector in src into an opmask register dst.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    // Without AVX512VL/BW/DQ: widen the booleans to ints and compare against
    // the stub-provided bit pattern to derive the mask.
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // Negate bytewise (0/1 -> 0x00/0xFF) and move the byte sign bits into dst.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1618 
1619 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1620   if (is_integral_type(bt)) {
1621     switch (vlen_in_bytes) {
1622       case 4:  movdl(dst, src);   break;
1623       case 8:  movq(dst, src);    break;
1624       case 16: movdqu(dst, src);  break;
1625       case 32: vmovdqu(dst, src); break;
1626       case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1627       default: ShouldNotReachHere();
1628     }
1629   } else {
1630     switch (vlen_in_bytes) {
1631       case 4:  movflt(dst, src); break;
1632       case 8:  movdbl(dst, src); break;
1633       case 16: movups(dst, src); break;
1634       case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1635       case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1636       default: ShouldNotReachHere();
1637     }
1638   }
1639 }
1640 
1641 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1642   assert(rscratch != noreg || always_reachable(src), "missing");
1643 
1644   if (reachable(src)) {
1645     load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1646   } else {
1647     lea(rscratch, src);
1648     load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1649   }
1650 }
1651 
// Load/broadcast a constant vector from the constant table at src into dst,
// picking the cheapest form the current ISA supports; pre-AVX hardware
// falls back to a plain vector load.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit element broadcast needs AVX2; otherwise duplicate with vmovddup.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd is not used at 128 bits; vmovddup covers that case.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // 32-bit elements: integer broadcast on AVX2, FP broadcast otherwise.
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    load_vector(bt, dst, src, vlen);
  }
}
1680 
1681 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1682   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1683   int offset = exact_log2(type2aelembytes(bt)) << 6;
1684   if (is_floating_point_type(bt)) {
1685     offset += 128;
1686   }
1687   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1688   load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1689 }
1690 
1691 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1692 
// One combining step of a strictly-ordered reduction at 128 bits:
// dst = dst op src. Integer opcodes operate lane-wise; the FP add/mul
// opcodes use the scalar (low-lane) instructions to preserve ordering.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        // 64-bit min/max only exists as an AVX-512 instruction.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        // Note: no T_BYTE case — byte multiplies are widened to short first
        // (see mulreduce*B).
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1745 
1746 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1747   switch (opcode) {
1748     case Op_AddReductionVF: addps(dst, src); break;
1749     case Op_AddReductionVD: addpd(dst, src); break;
1750     case Op_MulReductionVF: mulps(dst, src); break;
1751     case Op_MulReductionVD: mulpd(dst, src); break;
1752     default:                assert(false, "%s", NodeClassNames[opcode]);
1753   }
1754 }
1755 
// One combining step of a strictly-ordered reduction at 256 bits:
// dst = src1 op src2, lane-wise (three-operand AVX forms).
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        // 64-bit min/max only exists as an AVX-512 instruction.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1803 
// One combining step of an unordered FP reduction at 256 bits:
// dst = src1 op src2, lane-parallel.
void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
    case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
    case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
    case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}
1815 
1816 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1817                                   XMMRegister dst, XMMRegister src,
1818                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1819   switch (opcode) {
1820     case Op_AddReductionVF:
1821     case Op_MulReductionVF:
1822       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1823       break;
1824 
1825     case Op_AddReductionVD:
1826     case Op_MulReductionVD:
1827       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1828       break;
1829 
1830     default: assert(false, "wrong opcode");
1831   }
1832 }
1833 
// Unordered floating-point reduction entry point: route to the float or
// double lowering based on the reduction opcode.
void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
                                            XMMRegister dst, XMMRegister src,
                                            XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1851 
// Byte reduction dispatcher: select the lowering for the given lane count.
void C2_MacroAssembler::reduceB(int opcode, int vlen,
                             Register dst, Register src1, XMMRegister src2,
                             XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
1864 
// Byte multiply-reduction dispatcher (separate from reduceB because byte
// multiplies are lowered via widening to short — see mulreduce*B).
void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                             Register dst, Register src1, XMMRegister src2,
                             XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
1877 
// Short reduction dispatcher: select the lowering for the given lane count.
void C2_MacroAssembler::reduceS(int opcode, int vlen,
                             Register dst, Register src1, XMMRegister src2,
                             XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
1890 
1891 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1892                              Register dst, Register src1, XMMRegister src2,
1893                              XMMRegister vtmp1, XMMRegister vtmp2) {
1894   switch (vlen) {
1895     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899 
1900     default: assert(false, "wrong vector length");
1901   }
1902 }
1903 
// Long reduction dispatcher: select the lowering for the given lane count.
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                             Register dst, Register src1, XMMRegister src2,
                             XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
1915 
// Strictly-ordered float reduction dispatcher. The narrow cases require a
// second temp to be absent, documenting how many temps each lowering uses.
void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
1935 
// Strictly-ordered double reduction dispatcher.
void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      // The 2-lane lowering needs only one temp.
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
1951 
// Unordered float reduction dispatcher; narrow cases need fewer temps.
void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2F(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
1972 
// Unordered double reduction dispatcher; narrow cases need fewer temps.
void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2D(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4D(opcode, dst, src, vtmp1);
      break;
    case 8:
      unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
1990 
// Reduce 2 int lanes of src2, fold in the scalar seed src1, result in dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Horizontal add collapses lanes 0 and 1 into lane 0.
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring lane 1 down to lane 0, then combine the two lanes.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar seed and move the result to the GP register.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2005 
// Reduce 4 int lanes of src2 (plus scalar seed src1) into dst by halving
// to 2 lanes and deferring to reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // 0xE moves the high qword (lanes 2,3) down; combine with lanes 0,1.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2019 
// Reduce 8 int lanes of src2 (plus scalar seed src1) into dst.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Fold the upper 128 bits onto the lower, then reduce 4 lanes.
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2032 
// Reduce 16 int lanes: fold the upper 256 bits onto the lower half,
// then defer to the 8-lane reduction.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2038 
// Reduce 8 byte lanes of src2 (plus scalar seed src1) into dst,
// sign-extending the final byte result to 32 bits.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Fold halves: 8 -> 4 bytes (dword shuffle), 4 -> 2 (shift by 2),
  // 2 -> 1 (shift by 1).
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Combine with the scalar seed at int width, then extract the low byte.
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2054 
// Reduce 16 byte lanes: fold the high qword onto the low, then reduce 8.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2060 
// Reduce 32 byte lanes: fold the upper 128 bits onto the lower, then reduce 16.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2066 
// Reduce 64 byte lanes: fold the upper 256 bits onto the lower, then reduce 32.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2072 
// Multiply-reduce 8 bytes: sign-extend to shorts and reduce at short width
// (the multiply reduction is only implemented for short and wider lanes).
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2077 
2078 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2079   if (UseAVX > 1) {
2080     int vector_len = Assembler::AVX_256bit;
2081     vpmovsxbw(vtmp1, src2, vector_len);
2082     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2083   } else {
2084     pmovsxbw(vtmp2, src2);
2085     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2086     pshufd(vtmp2, src2, 0x1);
2087     pmovsxbw(vtmp2, src2);
2088     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2089   }
2090 }
2091 
// Multiply-reduce 32 bytes in src2 (with scalar seed src1) into dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    // AVX512BW: widen all 32 bytes to 32 shorts in a 512-bit register.
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    // AVX2: reduce the low 16 bytes, then fold in the high 16 bytes with the
    // intermediate result in dst as the seed.
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2104 
// Multiply-reduce 64 bytes: low 256 bits first, then fold in the high half
// with the intermediate result in dst as the seed.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2110 
// Reduce 4 short lanes of src2 (plus scalar seed src1) into dst,
// sign-extending the final short result to 32 bits.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds collapse the 4 shorts into lane 0.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Fold halves: 4 -> 2 shorts (dword shuffle), 2 -> 1 (shift by 2).
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  // Fold in the scalar seed at int width, then extract the low short.
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2131 
// Reduce 8 short lanes of src2 (plus scalar seed src1) into dst by halving
// to 4 lanes and deferring to reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    // 0xE moves the high qword (lanes 4..7) down; combine with lanes 0..3.
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2144 
// Reduce 16 short lanes of src2 (plus scalar seed src1) into dst.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    // vphaddw works within 128-bit halves; vpermq 0xD8 regroups the partial
    // sums so the meaningful data sits in the low 128 bits.
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2156 
2157 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2158   int vector_len = Assembler::AVX_256bit;
2159   vextracti64x4_high(vtmp1, src2);
2160   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2161   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2162 }
2163 
// Reduce 2 long lanes of src2, fold in the scalar seed src1, result in dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);   // move the high qword down
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2171 
// Reduce 4 long lanes: fold the upper 128 bits onto the lower, then reduce 2.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2177 
// Reduce 8 long lanes: fold the upper 256 bits onto the lower, then reduce 4.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2183 
// Build an opmask with the low 'len' bits set: start from all-ones, clear
// the bits at positions >= len with BZHI, and move the result into dst.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2189 
// Strictly-ordered reduction of 2 float lanes: dst = (dst op src[0]) op src[1].
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);   // bring lane 1 into the scalar position
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2195 
// Strictly-ordered reduction of 4 float lanes: combine lanes 0..3 into dst
// in order, lane by lane.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);   // lane 2
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);   // lane 3
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2203 
// Strictly-ordered reduction of 8 float lanes: low 4 first, then the high 4.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2209 
// Strictly-ordered reduction of 16 float lanes: low 8 first, then the high 8.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2215 
// Unordered reduction of 2 float lanes: dst = src[1] op src[0].
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2220 
// Unordered reduction of 4 float lanes: combine the two halves lane-parallel,
// then finish with the 2-lane step.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2226 
// Unordered reduction of 8 float lanes: fold the upper 128 bits onto the
// lower, then reduce 4.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2232 
// Unordered reduction of 16 float lanes: fold the upper 256 bits onto the
// lower, then reduce 8.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2238 
// Strictly-ordered reduction of 2 double lanes: dst = (dst op src[0]) op src[1].
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);   // bring lane 1 into the scalar position
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2244 
// Strictly-ordered reduction of 4 double lanes: low 2 first, then the high 2.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2250 
// Strictly-ordered reduction of 8 double lanes: low 4 first, then the high 4.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2256 
// Unordered reduction of 2 double lanes: dst = src[1] op src[0].
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2261 
// Unordered reduction of 4 double lanes: fold the upper 128 bits onto the
// lower, then reduce 2.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2267 
// Unordered reduction of 8 double lanes: fold the upper 256 bits onto the
// lower, then reduce 4.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2273 
// Masked vector load: thin forwarding wrapper to the MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2277 
// Masked vector store: thin forwarding wrapper to the MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2281 
// Masked register-to-register vector move: thin forwarding wrapper to the
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2285 
2286 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2287                                  int vec_enc) {
2288   switch(elem_bt) {
2289     case T_INT:
2290     case T_FLOAT:
2291       vmaskmovps(dst, src, mask, vec_enc);
2292       break;
2293     case T_LONG:
2294     case T_DOUBLE:
2295       vmaskmovpd(dst, src, mask, vec_enc);
2296       break;
2297     default:
2298       fatal("Unsupported type %s", type2name(elem_bt));
2299       break;
2300   }
2301 }
2302 
// AVX masked vector store: 32-bit element types use the ps form,
// 64-bit element types the pd form.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
                                 int vec_enc) {
  switch(elem_bt) {
    case T_INT:
    case T_FLOAT:
      vmaskmovps(dst, src, mask, vec_enc);
      break;
    case T_LONG:
    case T_DOUBLE:
      vmaskmovpd(dst, src, mask, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
2319 
// Min/max reduction over a float vector of `vlen` elements (vlen = 2..16).
// Repeatedly halves the active width: extract/permute the upper half next to
// the lower half, then combine with vminmax_fp (which presumably implements
// Java Math.min/max semantics incl. NaN/-0.0 handling — see its definition).
// If is_dst_valid, dst holds a prior partial result that is folded in at the
// end; otherwise the final combine writes straight into dst.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // Shuffle immediates for the in-lane steps: 1 swaps adjacent floats,
  // 14 (0b1110) moves elements [3:2] down for the pairwise step.
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;   // working source, rotates to wdst after each step
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  // log2(vlen) halving steps, widest first.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst; // last step can write the final answer directly
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc); // 512 -> 256: upper 8 floats
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);  // 256 -> 128: upper 4 floats
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc); // in-lane shuffle
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst; // result of this step feeds the next
    vlen_enc = Assembler::AVX_128bit; // after the first extract, 128 bits suffice
  }
  // Fold the reduction result into the caller-provided accumulator.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2362 
// Min/max reduction over a double vector of `vlen` elements (vlen = 2..8).
// Same halving scheme as reduceFloatMinMax: extract/permute the upper half,
// combine with vminmax_fp, repeat until one element remains. If is_dst_valid,
// dst carries a prior partial result merged in at the end.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                        XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                        XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;   // working source, rotates to wdst after each step
  XMMRegister wdst = xmm_0; // working destination
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst; // last step can write the final answer directly
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);  // 256 -> 128: upper 2 doubles
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc); // 512 -> 256: upper 4 doubles
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc); // swap the two doubles in the lane
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst; // result of this step feeds the next
    vlen_enc = Assembler::AVX_128bit; // after the first extract, 128 bits suffice
  }

  // Fold the reduction result into the caller-provided accumulator.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2404 
2405 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2406   switch (bt) {
2407     case T_BYTE:  pextrb(dst, src, idx); break;
2408     case T_SHORT: pextrw(dst, src, idx); break;
2409     case T_INT:   pextrd(dst, src, idx); break;
2410     case T_LONG:  pextrq(dst, src, idx); break;
2411 
2412     default:
2413       assert(false,"Should not reach here.");
2414       break;
2415   }
2416 }
2417 
2418 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2419   int esize =  type2aelembytes(typ);
2420   int elem_per_lane = 16/esize;
2421   int lane = elemindex / elem_per_lane;
2422   int eindex = elemindex % elem_per_lane;
2423 
2424   if (lane >= 2) {
2425     assert(UseAVX > 2, "required");
2426     vextractf32x4(dst, src, lane & 3);
2427     return dst;
2428   } else if (lane > 0) {
2429     assert(UseAVX > 0, "required");
2430     vextractf128(dst, src, lane);
2431     return dst;
2432   } else {
2433     return src;
2434   }
2435 }
2436 
2437 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2438   if (typ == T_BYTE) {
2439     movsbl(dst, dst);
2440   } else if (typ == T_SHORT) {
2441     movswl(dst, dst);
2442   }
2443 }
2444 
2445 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2446   int esize =  type2aelembytes(typ);
2447   int elem_per_lane = 16/esize;
2448   int eindex = elemindex % elem_per_lane;
2449   assert(is_integral_type(typ),"required");
2450 
2451   if (eindex == 0) {
2452     if (typ == T_LONG) {
2453       movq(dst, src);
2454     } else {
2455       movdl(dst, src);
2456       movsxl(typ, dst);
2457     }
2458   } else {
2459     extract(typ, dst, src, eindex);
2460     movsxl(typ, dst);
2461   }
2462 }
2463 
// Move the float/double element at `elemindex` (index within its 128-bit lane)
// from `src` into the low part of `dst`. For T_FLOAT the upper 96 bits of dst
// are then cleared via an AND with a 32-bit mask constant; vtmp is only needed
// for that masking on the non-AVX path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // Element 0: a 64-bit move suffices (also zeroes the upper bits of dst).
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Shuffle the requested float into position 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Double: shift the requested element down to byte 0 ...
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      // ... and clear everything above the low 64 bits.
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2501 
2502 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2503   switch(typ) {
2504     case T_BYTE:
2505     case T_BOOLEAN:
2506       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2507       break;
2508     case T_SHORT:
2509     case T_CHAR:
2510       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2511       break;
2512     case T_INT:
2513     case T_FLOAT:
2514       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2515       break;
2516     case T_LONG:
2517     case T_DOUBLE:
2518       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2519       break;
2520     default:
2521       assert(false,"Should not reach here.");
2522       break;
2523   }
2524 }
2525 
2526 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2527   assert(rscratch != noreg || always_reachable(src2), "missing");
2528 
2529   switch(typ) {
2530     case T_BOOLEAN:
2531     case T_BYTE:
2532       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2533       break;
2534     case T_CHAR:
2535     case T_SHORT:
2536       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2537       break;
2538     case T_INT:
2539     case T_FLOAT:
2540       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2541       break;
2542     case T_LONG:
2543     case T_DOUBLE:
2544       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2545       break;
2546     default:
2547       assert(false,"Should not reach here.");
2548       break;
2549   }
2550 }
2551 
2552 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2553   switch(typ) {
2554     case T_BYTE:
2555       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2556       break;
2557     case T_SHORT:
2558       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2559       break;
2560     case T_INT:
2561     case T_FLOAT:
2562       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2563       break;
2564     case T_LONG:
2565     case T_DOUBLE:
2566       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2567       break;
2568     default:
2569       assert(false,"Should not reach here.");
2570       break;
2571   }
2572 }
2573 
// Sets the condition flags from a test of src1 against src2 (PTEST/VTESTPS
// semantics), used for vector mask any-true/all-true checks. Elements of
// 4 bytes or more use VTESTPS (tests only sign bits); smaller elements need
// the full-bit PTEST/VPTEST.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    // Full 256-bit vector: test directly, no scratch register needed.
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 replicates dword 0 four times (4-byte vector); 0x04 replicates
    // the low qword's dwords for the 8-byte case.
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    // Exactly 128 bits: test src1 in place.
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2602 
2603 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2604 #ifdef ASSERT
2605   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2606   bool is_bw_supported = VM_Version::supports_avx512bw();
2607   if (is_bw && !is_bw_supported) {
2608     assert(vlen_enc != Assembler::AVX_512bit, "required");
2609     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2610            "XMM register should be 0-15");
2611   }
2612 #endif // ASSERT
2613   switch (elem_bt) {
2614     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2615     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2616     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2617     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2618     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2619     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2620     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2621   }
2622 }
2623 
// Broadcast the scalar in GPR `src` to all lanes of vector `dst`.
// Preferred path: the EVEX GPR-source broadcast forms (need AVX-512, plus
// AVX512BW for byte/short and AVX512VL for sub-512-bit vectors). Fallback:
// move the GPR into the XMM register first, then use the AVX2 register-source
// broadcast — which only encodes XMM registers 0-15.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX broadcast directly from the general-purpose register.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 path: GPR -> XMM move, then broadcast from the XMM register.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2652 
2653 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2654   switch (to_elem_bt) {
2655     case T_SHORT:
2656       vpmovsxbw(dst, src, vlen_enc);
2657       break;
2658     case T_INT:
2659       vpmovsxbd(dst, src, vlen_enc);
2660       break;
2661     case T_FLOAT:
2662       vpmovsxbd(dst, src, vlen_enc);
2663       vcvtdq2ps(dst, dst, vlen_enc);
2664       break;
2665     case T_LONG:
2666       vpmovsxbq(dst, src, vlen_enc);
2667       break;
2668     case T_DOUBLE: {
2669       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2670       vpmovsxbd(dst, src, mid_vlen_enc);
2671       vcvtdq2pd(dst, dst, vlen_enc);
2672       break;
2673     }
2674     default:
2675       fatal("Unsupported type %s", type2name(to_elem_bt));
2676       break;
2677   }
2678 }
2679 
2680 //-------------------------------------------------------------------------------------------
2681 
2682 // IndexOf for constant substrings with size >= 8 chars
2683 // which don't need to be loaded through stack.
// Substring search (String.indexOf) specialization for a CONSTANT substring of
// at least `stride` elements (8 chars / 16 bytes), so the substring can be
// (re)loaded directly from memory and never needs to go through the stack.
// On exit `result` holds the match index, or -1 if not found.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  // scale1 addresses the scanned string, scale2 the substring (UL reads the
  // substring as bytes and widens on load).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0)); // zero-extend latin1 bytes to chars
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1)); // advance string pointer by one element

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride); // cnt2 is now a negative offset from the substring end

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Small enough substring: immediate displacement cannot overflow int.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // convert byte offset to char index
  }
  bind(EXIT);

} // string_indexofC8
2860 
2861 // Small strings are loaded through stack if they cross page boundary.
2862 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2863                                        Register cnt1, Register cnt2,
2864                                        int int_cnt2,  Register result,
2865                                        XMMRegister vec, Register tmp,
2866                                        int ae) {
2867   ShortBranchVerifier sbv(this);
2868   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2869   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2870 
2871   //
2872   // int_cnt2 is length of small (< 8 chars) constant substring
2873   // or (-1) for non constant substring in which case its length
2874   // is in cnt2 register.
2875   //
2876   // Note, inline_string_indexOf() generates checks:
2877   // if (substr.count > string.count) return -1;
2878   // if (substr.count == 0) return 0;
2879   //
2880   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2881   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2882   // This method uses the pcmpestri instruction with bound registers
2883   //   inputs:
2884   //     xmm - substring
2885   //     rax - substring length (elements count)
2886   //     mem - scanned string
2887   //     rdx - string length (elements count)
2888   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2889   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2890   //   outputs:
2891   //     rcx - matched index in string
2892   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2893   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2894   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2895   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2896 
2897   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2898         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2899         FOUND_CANDIDATE;
2900 
2901   { //========================================================
2902     // We don't know where these strings are located
2903     // and we can't read beyond them. Load them through stack.
2904     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2905 
2906     movptr(tmp, rsp); // save old SP
2907 
2908     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2909       if (int_cnt2 == (1>>scale2)) { // One byte
2910         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2911         load_unsigned_byte(result, Address(str2, 0));
2912         movdl(vec, result); // move 32 bits
2913       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2914         // Not enough header space in 32-bit VM: 12+3 = 15.
2915         movl(result, Address(str2, -1));
2916         shrl(result, 8);
2917         movdl(vec, result); // move 32 bits
2918       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2919         load_unsigned_short(result, Address(str2, 0));
2920         movdl(vec, result); // move 32 bits
2921       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2922         movdl(vec, Address(str2, 0)); // move 32 bits
2923       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2924         movq(vec, Address(str2, 0));  // move 64 bits
2925       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2926         // Array header size is 12 bytes in 32-bit VM
2927         // + 6 bytes for 3 chars == 18 bytes,
2928         // enough space to load vec and shift.
2929         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2930         if (ae == StrIntrinsicNode::UL) {
2931           int tail_off = int_cnt2-8;
2932           pmovzxbw(vec, Address(str2, tail_off));
2933           psrldq(vec, -2*tail_off);
2934         }
2935         else {
2936           int tail_off = int_cnt2*(1<<scale2);
2937           movdqu(vec, Address(str2, tail_off-16));
2938           psrldq(vec, 16-tail_off);
2939         }
2940       }
2941     } else { // not constant substring
2942       cmpl(cnt2, stride);
2943       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2944 
2945       // We can read beyond string if srt+16 does not cross page boundary
2946       // since heaps are aligned and mapped by pages.
2947       assert(os::vm_page_size() < (int)G, "default page should be small");
2948       movl(result, str2); // We need only low 32 bits
2949       andl(result, ((int)os::vm_page_size()-1));
2950       cmpl(result, ((int)os::vm_page_size()-16));
2951       jccb(Assembler::belowEqual, CHECK_STR);
2952 
2953       // Move small strings to stack to allow load 16 bytes into vec.
2954       subptr(rsp, 16);
2955       int stk_offset = wordSize-(1<<scale2);
2956       push(cnt2);
2957 
2958       bind(COPY_SUBSTR);
2959       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2960         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2961         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2962       } else if (ae == StrIntrinsicNode::UU) {
2963         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2964         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2965       }
2966       decrement(cnt2);
2967       jccb(Assembler::notZero, COPY_SUBSTR);
2968 
2969       pop(cnt2);
2970       movptr(str2, rsp);  // New substring address
2971     } // non constant
2972 
2973     bind(CHECK_STR);
2974     cmpl(cnt1, stride);
2975     jccb(Assembler::aboveEqual, BIG_STRINGS);
2976 
2977     // Check cross page boundary.
2978     movl(result, str1); // We need only low 32 bits
2979     andl(result, ((int)os::vm_page_size()-1));
2980     cmpl(result, ((int)os::vm_page_size()-16));
2981     jccb(Assembler::belowEqual, BIG_STRINGS);
2982 
2983     subptr(rsp, 16);
2984     int stk_offset = -(1<<scale1);
2985     if (int_cnt2 < 0) { // not constant
2986       push(cnt2);
2987       stk_offset += wordSize;
2988     }
2989     movl(cnt2, cnt1);
2990 
2991     bind(COPY_STR);
2992     if (ae == StrIntrinsicNode::LL) {
2993       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2994       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2995     } else {
2996       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2997       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2998     }
2999     decrement(cnt2);
3000     jccb(Assembler::notZero, COPY_STR);
3001 
3002     if (int_cnt2 < 0) { // not constant
3003       pop(cnt2);
3004     }
3005     movptr(str1, rsp);  // New string address
3006 
3007     bind(BIG_STRINGS);
3008     // Load substring.
3009     if (int_cnt2 < 0) { // -1
3010       if (ae == StrIntrinsicNode::UL) {
3011         pmovzxbw(vec, Address(str2, 0));
3012       } else {
3013         movdqu(vec, Address(str2, 0));
3014       }
3015       push(cnt2);       // substr count
3016       push(str2);       // substr addr
3017       push(str1);       // string addr
3018     } else {
3019       // Small (< 8 chars) constant substrings are loaded already.
3020       movl(cnt2, int_cnt2);
3021     }
3022     push(tmp);  // original SP
3023 
3024   } // Finished loading
3025 
3026   //========================================================
3027   // Start search
3028   //
3029 
3030   movptr(result, str1); // string addr
3031 
3032   if (int_cnt2  < 0) {  // Only for non constant substring
3033     jmpb(SCAN_TO_SUBSTR);
3034 
3035     // SP saved at sp+0
3036     // String saved at sp+1*wordSize
3037     // Substr saved at sp+2*wordSize
3038     // Substr count saved at sp+3*wordSize
3039 
3040     // Reload substr for rescan, this code
3041     // is executed only for large substrings (> 8 chars)
3042     bind(RELOAD_SUBSTR);
3043     movptr(str2, Address(rsp, 2*wordSize));
3044     movl(cnt2, Address(rsp, 3*wordSize));
3045     if (ae == StrIntrinsicNode::UL) {
3046       pmovzxbw(vec, Address(str2, 0));
3047     } else {
3048       movdqu(vec, Address(str2, 0));
3049     }
3050     // We came here after the beginning of the substring was
3051     // matched but the rest of it was not so we need to search
3052     // again. Start from the next element after the previous match.
3053     subptr(str1, result); // Restore counter
3054     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3055       shrl(str1, 1);
3056     }
3057     addl(cnt1, str1);
3058     decrementl(cnt1);   // Shift to next element
3059     cmpl(cnt1, cnt2);
3060     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3061 
3062     addptr(result, (1<<scale1));
3063   } // non constant
3064 
3065   // Scan string for start of substr in 16-byte vectors
3066   bind(SCAN_TO_SUBSTR);
3067   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3068   pcmpestri(vec, Address(result, 0), mode);
3069   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3070   subl(cnt1, stride);
3071   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3072   cmpl(cnt1, cnt2);
3073   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3074   addptr(result, 16);
3075 
3076   bind(ADJUST_STR);
3077   cmpl(cnt1, stride); // Do not read beyond string
3078   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3079   // Back-up string to avoid reading beyond string.
3080   lea(result, Address(result, cnt1, scale1, -16));
3081   movl(cnt1, stride);
3082   jmpb(SCAN_TO_SUBSTR);
3083 
3084   // Found a potential substr
3085   bind(FOUND_CANDIDATE);
3086   // After pcmpestri tmp(rcx) contains matched element index
3087 
3088   // Make sure string is still long enough
3089   subl(cnt1, tmp);
3090   cmpl(cnt1, cnt2);
3091   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3092   // Left less then substring.
3093 
3094   bind(RET_NOT_FOUND);
3095   movl(result, -1);
3096   jmp(CLEANUP);
3097 
3098   bind(FOUND_SUBSTR);
3099   // Compute start addr of substr
3100   lea(result, Address(result, tmp, scale1));
3101   if (int_cnt2 > 0) { // Constant substring
3102     // Repeat search for small substring (< 8 chars)
3103     // from new point without reloading substring.
3104     // Have to check that we don't read beyond string.
3105     cmpl(tmp, stride-int_cnt2);
3106     jccb(Assembler::greater, ADJUST_STR);
3107     // Fall through if matched whole substring.
3108   } else { // non constant
3109     assert(int_cnt2 == -1, "should be != 0");
3110 
3111     addl(tmp, cnt2);
3112     // Found result if we matched whole substring.
3113     cmpl(tmp, stride);
3114     jcc(Assembler::lessEqual, RET_FOUND);
3115 
3116     // Repeat search for small substring (<= 8 chars)
3117     // from new point 'str1' without reloading substring.
3118     cmpl(cnt2, stride);
3119     // Have to check that we don't read beyond string.
3120     jccb(Assembler::lessEqual, ADJUST_STR);
3121 
3122     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3123     // Compare the rest of substring (> 8 chars).
3124     movptr(str1, result);
3125 
3126     cmpl(tmp, cnt2);
3127     // First 8 chars are already matched.
3128     jccb(Assembler::equal, CHECK_NEXT);
3129 
3130     bind(SCAN_SUBSTR);
3131     pcmpestri(vec, Address(str1, 0), mode);
3132     // Need to reload strings pointers if not matched whole vector
3133     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3134 
3135     bind(CHECK_NEXT);
3136     subl(cnt2, stride);
3137     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3138     addptr(str1, 16);
3139     if (ae == StrIntrinsicNode::UL) {
3140       addptr(str2, 8);
3141     } else {
3142       addptr(str2, 16);
3143     }
3144     subl(cnt1, stride);
3145     cmpl(cnt2, stride); // Do not read beyond substring
3146     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3147     // Back-up strings to avoid reading beyond substring.
3148 
3149     if (ae == StrIntrinsicNode::UL) {
3150       lea(str2, Address(str2, cnt2, scale2, -8));
3151       lea(str1, Address(str1, cnt2, scale1, -16));
3152     } else {
3153       lea(str2, Address(str2, cnt2, scale2, -16));
3154       lea(str1, Address(str1, cnt2, scale1, -16));
3155     }
3156     subl(cnt1, cnt2);
3157     movl(cnt2, stride);
3158     addl(cnt1, stride);
3159     bind(CONT_SCAN_SUBSTR);
3160     if (ae == StrIntrinsicNode::UL) {
3161       pmovzxbw(vec, Address(str2, 0));
3162     } else {
3163       movdqu(vec, Address(str2, 0));
3164     }
3165     jmp(SCAN_SUBSTR);
3166 
3167     bind(RET_FOUND_LONG);
3168     movptr(str1, Address(rsp, wordSize));
3169   } // non constant
3170 
3171   bind(RET_FOUND);
3172   // Compute substr offset
3173   subptr(result, str1);
3174   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3175     shrl(result, 1); // index
3176   }
3177   bind(CLEANUP);
3178   pop(rsp); // restore SP
3179 
3180 } // string_indexof
3181 
// Intrinsic for StringUTF16.indexOf(char): find the first occurrence of the
// 16-bit char 'ch' in the string starting at 'str1' holding 'cnt1' chars.
// On exit 'result' holds the char index of the first match, or -1 if absent.
//   str1   - address of the first char (preserved; used to compute the index)
//   cnt1   - number of chars to scan (clobbered: reduced to the tail count)
//   ch     - char to search for (clobbered on the vector-hit path by bsfl)
//   vec1/vec2/vec3, tmp - scratch
// Strategy: with AVX2, scan 16 chars per iteration with 256-bit compares,
// then 8 chars per iteration with 128-bit SSE4.2 compares, then a scalar
// tail loop over the remaining (< 8) chars.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;  // 8 chars == one 16-byte XMM vector

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  // result walks through the string; the final index is result - str1.
  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Replicate ch into all sixteen 16-bit lanes of vec1; vec2 stays zero
    // so that vptest(vec2, vec3) sets CF iff vec3 is all-zero (no match).
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);  // CF clear -> some lane matched
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // Replicate ch into all eight 16-bit lanes of the XMM register.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // Non-AVX2 path did not set up the pattern above; do it here.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);  // CF clear -> some lane matched
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  // Scalar tail: scan the remaining cnt1 (< 8) chars one at a time.
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  // A vector compare hit: locate the first matching lane.
  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  // Lowest set bit of the byte mask = byte offset of the first match.
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // result points at the matching char; convert pointer to char index.
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char
3274 
// Intrinsic for StringLatin1.indexOf(char): find the first occurrence of the
// byte value 'ch' in the Latin-1 string starting at 'str1' holding 'cnt1'
// bytes. On exit 'result' holds the byte index of the first match, or -1.
//   str1   - address of the first byte (preserved; used to compute the index)
//   cnt1   - number of bytes to scan (clobbered: reduced to the tail count)
//   ch     - byte to search for (clobbered on the vector-hit path by bsfl)
//   vec1/vec2/vec3, tmp - scratch
// Strategy mirrors string_indexof_char but on byte lanes: 32 bytes per
// iteration with AVX2, then 16 bytes per iteration with SSE4.2, then a
// scalar tail loop over the remaining (< 16) bytes.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;  // 16 bytes == one XMM vector

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  // result walks through the string; the final index is result - str1.
  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Replicate ch into all 32 byte lanes of vec1; vec2 stays zero so that
    // vptest(vec2, vec3) sets CF iff vec3 is all-zero (no match).
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);  // CF clear -> some lane matched
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // Replicate ch into all 16 byte lanes (pshufb with an all-zero mask).
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Non-AVX2 path did not set up the pattern above; do it here.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);  // CF clear -> some lane matched
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  // Scalar tail: scan the remaining cnt1 (< 16) bytes one at a time.
  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  // A vector compare hit: locate the first matching lane.
  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  // Lowest set bit of the byte mask = byte offset of the first match.
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // result points at the matching byte; convert pointer to byte index.
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3367 
3368 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3369   switch (eltype) {
3370   case T_BOOLEAN: return sizeof(jboolean);
3371   case T_BYTE:  return sizeof(jbyte);
3372   case T_SHORT: return sizeof(jshort);
3373   case T_CHAR:  return sizeof(jchar);
3374   case T_INT:   return sizeof(jint);
3375   default:
3376     ShouldNotReachHere();
3377     return -1;
3378   }
3379 }
3380 
3381 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3382   switch (eltype) {
3383   // T_BOOLEAN used as surrogate for unsigned byte
3384   case T_BOOLEAN: movzbl(dst, src);   break;
3385   case T_BYTE:    movsbl(dst, src);   break;
3386   case T_SHORT:   movswl(dst, src);   break;
3387   case T_CHAR:    movzwl(dst, src);   break;
3388   case T_INT:     movl(dst, src);     break;
3389   default:
3390     ShouldNotReachHere();
3391   }
3392 }
3393 
3394 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3395   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3396 }
3397 
3398 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3399   load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3400 }
3401 
3402 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3403   const int vlen = Assembler::AVX_256bit;
3404   switch (eltype) {
3405   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3406   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3407   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3408   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3409   case T_INT:
3410     // do nothing
3411     break;
3412   default:
3413     ShouldNotReachHere();
3414   }
3415 }
3416 
// Intrinsic for Arrays.hashCode: computes the polynomial 31-hash
//   result = result*31^cnt1 + a[0]*31^(cnt1-1) + ... + a[cnt1-1]
// over the cnt1 elements at 'ary1', accumulating into the incoming 'result'.
//   ary1   - element base address (clobbered on the vector path)
//   cnt1   - element count (clobbered)
//   result - in: initial hash; out: final hash
//   index, tmp2, tmp3             - scratch GPRs
//   vnext, vcoef*, vresult*, vtmp* - scratch vectors for the unrolled loop
//   eltype - element type, selects load width/signedness
// Arrays of >= 32 elements go through a 32-element-per-iteration vector loop
// driven by a precomputed powers-of-31 table; the remainder (and short
// arrays) are handled by a 2-element scalar loop plus an odd-element fixup.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance past the vectorized prefix; the scalar loop below handles the rest.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // Two elements per iteration:
  //   result = result*31*31 + a[i-1]*31 + a[i]   (961 == 31*31)
  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  // tmp3 = tmp2*31 computed as tmp2*32 - tmp2.
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Branch uses the flags from the preceding cmpl(index, cnt1): if index
  // overshot cnt1 the element count was even and all elements are folded in;
  // otherwise fold in the single remaining element as result*31 + a[i-1].
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3556 
3557 // helper function for string_compare
3558 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3559                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3560                                            Address::ScaleFactor scale2, Register index, int ae) {
3561   if (ae == StrIntrinsicNode::LL) {
3562     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3563     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3564   } else if (ae == StrIntrinsicNode::UU) {
3565     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3566     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3567   } else {
3568     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3569     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3570   }
3571 }
3572 
// Compare strings, used for char[] and byte[].
// Lexicographically compares the cnt1 elements at str1 with the cnt2
// elements at str2 and leaves in 'result' a negative/zero/positive value
// (str1 < / == / > str2); when the common prefix matches, the result is the
// length difference. 'ae' encodes the operand encodings (StrIntrinsicNode::
// LL/UU/LU/UL); for UL the operands arrive swapped and the result is negated
// at the end. The length difference is kept on the stack (push(cnt1)) until
// POP_LABEL/LENGTH_DIFF_LABEL. With AVX2 the bulk compare runs over 32-byte
// vectors (and an optional 64-byte AVX-512 inner loop); otherwise SSE4.2
// pcmpestri handles 16-byte chunks; a scalar loop finishes the tail.
// NOTE(review): pcmpestri hard-wires rax/rdx/rcx, hence the register asserts.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  // For mixed encodings cnt2 arrives in bytes; halve it to count elements.
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  // First elements differ: their difference is the answer.
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri mode: equal-each, negated result; low bit selects byte vs
    // word elements (cleared for LL = byte strings).
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    // vec1 is zero iff the 32-byte chunks were equal.
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-compare the last (possibly overlapping) full vector of the strings.
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    // AVX-512 loop found a miscompare: locate the first differing byte via
    // the inverted compare mask, convert to an element index, and compare
    // the two elements there to produce the signed result.
    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  // For UL the operands were compared in swapped order; flip the sign.
  if(ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}
3947 
3948 // Search for Non-ASCII character (Negative byte value) in a byte array,
3949 // return the index of the first such character, otherwise the length
3950 // of the array segment searched.
3951 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3952 //   @IntrinsicCandidate
3953 //   public static int countPositives(byte[] ba, int off, int len) {
3954 //     for (int i = off; i < off + len; i++) {
3955 //       if (ba[i] < 0) {
3956 //         return i - off;
3957 //       }
3958 //     }
3959 //     return len;
3960 //   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // Emits code that scans 'len' bytes at 'ary1' and leaves in 'result' the
  // index of the first byte with its sign bit set, or 'len' if all bytes are
  // non-negative. 'tmp1', 'vec1', 'vec2', 'mask1' and 'mask2' are scratch.
  // 'ary1' and 'len' are clobbered as well.
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  // NOTE(review): label ADJUST is declared but never bound or referenced below.
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    // AVX-512 path: compare 64 bytes per iteration against a zero vector.
    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 at the end of the vectorized region and count len up to zero.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      // Build a k-mask with the low tmp1 bits set so that only the tail
      // bytes participate in the masked compare below.
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are negatives, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Broadcast the sign-bit mask 0x80 into every byte lane of vec2.
      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  // Scalar tail: locate the exact index of the first negative byte.
  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  // Last byte is negative: the count of positives excludes it.
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4192 
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
// Emits code that sets 'result' to 1 if the contents are equal, 0 otherwise.
// With is_array_equ the inputs are array oops (identity, null and length
// checks are emitted); otherwise ary1/ary2 already point at the data and
// 'limit' holds the element count. With expand_ary2, ary2 holds bytes that
// are zero-extended to shorts before comparing against ary1.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // When expanding, ary1 advances two bytes per ary2 byte, so iterate in
  // 8-byte (ary2) steps instead of 16.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point both arrays at the end of the vectorized region; count limit up.
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      // Zero-extend 16 bytes of ary2 to 16 shorts to match ary1's layout.
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    // XOR is zero iff the 32-byte chunks are identical.
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the final (possibly overlapping) 32 bytes ending at the arrays' ends.
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char[] lengths are even in bytes, so no single-byte tail is possible.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4465 
// Out-of-line slow path for C2_MacroAssembler::convertF2I: spills the source
// XMM value to the stack, calls the runtime fixup routine ('target'), and
// pops the corrected result into 'dst'. Kept small because convertF2I sizes
// the stub with a fixed max_size budget.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  // Pass the original floating-point value to the fixup routine on the stack.
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4480 
// Convert a float/double in 'src' to an int/long in 'dst' with Java
// truncation semantics. The cvtt* instructions produce the "integer
// indefinite" sentinel (0x80000000 / 0x8000000000000000) on NaN or
// out-of-range input; when 'dst' equals that sentinel we branch to an
// out-of-line stub (convertF2I_slowpath) that calls the matching
// f2i/f2l/d2i/d2l fixup routine to compute the correct Java result.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    // 64-bit sentinel; double_sign_flip() holds 0x8000000000000000 and is
    // used for both float and double sources.
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}
4514 
// Emit a masked (AVX-512) vector shift or rotate with an immediate count.
// 'mask' selects the lanes written; 'merge' is forwarded to the Assembler
// ev* entry points (EVEX merge- vs. zero-masking). 'eType' is only consulted
// by the rotate cases, which dispatch on element type internally.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4545 
4546 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4547                                                XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4548   if (is_unsigned) {
4549     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4550   } else {
4551     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4552   }
4553 }
4554 
4555 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4556                                                       XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4557   switch (elem_bt) {
4558     case T_BYTE:
4559       if (ideal_opc == Op_SaturatingAddV) {
4560         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4561       } else {
4562         assert(ideal_opc == Op_SaturatingSubV, "");
4563         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4564       }
4565       break;
4566     case T_SHORT:
4567       if (ideal_opc == Op_SaturatingAddV) {
4568         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4569       } else {
4570         assert(ideal_opc == Op_SaturatingSubV, "");
4571         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4572       }
4573       break;
4574     default:
4575       fatal("Unsupported type %s", type2name(elem_bt));
4576       break;
4577   }
4578 }
4579 
4580 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4581                                                         XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4582   switch (elem_bt) {
4583     case T_BYTE:
4584       if (ideal_opc == Op_SaturatingAddV) {
4585         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4586       } else {
4587         assert(ideal_opc == Op_SaturatingSubV, "");
4588         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4589       }
4590       break;
4591     case T_SHORT:
4592       if (ideal_opc == Op_SaturatingAddV) {
4593         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4594       } else {
4595         assert(ideal_opc == Op_SaturatingSubV, "");
4596         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4597       }
4598       break;
4599     default:
4600       fatal("Unsupported type %s", type2name(elem_bt));
4601       break;
4602   }
4603 }
4604 
4605 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4606                                                Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4607   if (is_unsigned) {
4608     evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4609   } else {
4610     evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4611   }
4612 }
4613 
4614 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4615                                                       XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4616   switch (elem_bt) {
4617     case T_BYTE:
4618       if (ideal_opc == Op_SaturatingAddV) {
4619         evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4620       } else {
4621         assert(ideal_opc == Op_SaturatingSubV, "");
4622         evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4623       }
4624       break;
4625     case T_SHORT:
4626       if (ideal_opc == Op_SaturatingAddV) {
4627         evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4628       } else {
4629         assert(ideal_opc == Op_SaturatingSubV, "");
4630         evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4631       }
4632       break;
4633     default:
4634       fatal("Unsupported type %s", type2name(elem_bt));
4635       break;
4636   }
4637 }
4638 
4639 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4640                                                         XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4641   switch (elem_bt) {
4642     case T_BYTE:
4643       if (ideal_opc == Op_SaturatingAddV) {
4644         evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4645       } else {
4646         assert(ideal_opc == Op_SaturatingSubV, "");
4647         evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4648       }
4649       break;
4650     case T_SHORT:
4651       if (ideal_opc == Op_SaturatingAddV) {
4652         evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4653       } else {
4654         assert(ideal_opc == Op_SaturatingSubV, "");
4655         evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4656       }
4657       break;
4658     default:
4659       fatal("Unsupported type %s", type2name(elem_bt));
4660       break;
4661   }
4662 }
4663 
// Emit a masked (AVX-512) vector operation with a register second operand,
// dispatching on the C2 ideal opcode. 'mask' selects the lanes written and
// 'merge'/'vlen_enc' are forwarded to the underlying ev* emitters. 'eType'
// is consulted only by the cases that dispatch on element type internally
// (rotates, min/max, logical ops, rearrange). 'is_varshift' distinguishes
// per-lane variable shifts from uniform-count shifts.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary abs takes its input in src2 (src1 is unused by these cases).
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note the swapped operand order here (src2 before src1).
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4765 
// Dispatches a masked (predicated) vector operation, selected by the C2 ideal
// opcode, to the corresponding EVEX instruction emitter. This overload takes
// the second source operand from memory (src2 is an Address). Only lanes
// selected by 'mask' are written; 'merge' is forwarded to the underlying
// ev* emitters and controls how unselected destination lanes are treated
// (merge- vs. zero-masking -- see the Assembler definitions). 'eType' is only
// consulted by the emitters that need an element size (min/max and bitwise
// logic); the other opcodes encode the element size in the opcode itself.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    // Integer and floating point addition, one case per element width.
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Integer and floating point subtraction.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Multiplication and (floating point only) division.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Fused multiply-add (213 form: dst = src1 * dst + src2).
    // NOTE(review): operand roles depend on the evpfma213* emitter -- confirm there.
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Min/max (signed and unsigned) and bitwise logic take the element
    // type explicitly, since the ideal opcode does not encode it.
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4830 
4831 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4832                                   KRegister src1, KRegister src2) {
4833   BasicType etype = T_ILLEGAL;
4834   switch(mask_len) {
4835     case 2:
4836     case 4:
4837     case 8:  etype = T_BYTE; break;
4838     case 16: etype = T_SHORT; break;
4839     case 32: etype = T_INT; break;
4840     case 64: etype = T_LONG; break;
4841     default: fatal("Unsupported type"); break;
4842   }
4843   assert(etype != T_ILLEGAL, "");
4844   switch(ideal_opc) {
4845     case Op_AndVMask:
4846       kand(etype, dst, src1, src2); break;
4847     case Op_OrVMask:
4848       kor(etype, dst, src1, src2); break;
4849     case Op_XorVMask:
4850       kxor(etype, dst, src1, src2); break;
4851     default:
4852       fatal("Unsupported masked operation"); break;
4853   }
4854 }
4855 
4856 /*
4857  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4858  * If src is NaN, the result is 0.
4859  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4860  * the result is equal to the value of Integer.MIN_VALUE.
4861  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4862  * the result is equal to the value of Integer.MAX_VALUE.
4863  */
// Fixes up the result of a truncating float->int vector conversion (AVX, no
// opmasks). On entry 'dst' holds the raw vcvttps2dq result and 'src' the
// original floats. Lanes whose source was special hold 0x80000000 (the
// pattern at float_sign_flip); they are rewritten to 0 (NaN source) or
// Integer.MAX_VALUE (positive special source). Negative special sources keep
// 0x80000000 == Integer.MIN_VALUE. Clobbers xtmp1-xtmp4.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Build a mask (xtmp2) of dst lanes equal to 0x80000000; if none match,
  // take the fast path -- no special values to patch.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // Flip all bits of 0x80000000 to get 0x7FFFFFFF (Integer.MAX_VALUE) in xtmp1.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Positive special lanes receive Integer.MAX_VALUE (vblendvps selects on MSB).
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4893 
// EVEX variant of the float->int special-value fixup: uses opmask registers
// (ktmp1/ktmp2) and masked moves instead of vector blends. See the comment
// above the AVX variant for the value mapping (NaN -> 0, +special -> MAX,
// -special keeps 0x80000000 == Integer.MIN_VALUE).
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 := lanes of dst equal to 0x80000000 (potential specials);
  // fast exit if there are none.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes (ktmp1 ^ NaN-mask) that are >= 0 (NLT_UQ against
  // the zero vector) receive Integer.MAX_VALUE; vpternlogd imm 0x11 computes
  // the bitwise NOT of xtmp1 (0x80000000 -> 0x7FFFFFFF) into xtmp2.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4915 
// Fixes up the result of a truncating float->long vector conversion (EVEX).
// 'dst' holds raw evcvttps2qq output (qword lanes), 'src' the original floats
// -- hence the float compares (evcmpps) against a qword sign-flip constant.
// NaN sources -> 0; positive specials -> Long.MAX_VALUE; negative specials
// keep the double_sign_flip pattern (== Long.MIN_VALUE).
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 := qword lanes of dst equal to the sign-flip pattern; fast exit if none.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (float) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes that are >= 0 get Long.MAX_VALUE
  // (vpternlogq imm 0x11 = bitwise NOT of xtmp1 into xtmp2).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4938 
// Fixes up the result of a truncating double->int vector conversion (EVEX).
// 'dst' holds raw vcvttpd2dq output (dword lanes, half the source width),
// 'src' the original doubles -- hence double compares (evcmppd) but dword
// equality/moves on the destination. NaN -> 0; positive specials ->
// Integer.MAX_VALUE; negative specials keep 0x80000000 == Integer.MIN_VALUE.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 := dword lanes of dst equal to 0x80000000; fast exit if none.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (double) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes that are >= 0 get Integer.MAX_VALUE
  // (vpternlogq imm 0x11 = bitwise NOT of xtmp1 into xtmp2).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4960 
4961 /*
4962  * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4963  * If src is NaN, the result is 0.
4964  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4965  * the result is equal to the value of Long.MIN_VALUE.
4966  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4967  * the result is equal to the value of Long.MAX_VALUE.
4968  */
// Fixes up the result of a truncating double->long vector conversion (EVEX).
// 'dst' holds raw evcvttpd2qq output, 'src' the original doubles.
// NaN -> 0; positive specials -> Long.MAX_VALUE; negative specials keep the
// sign-flip pattern == Long.MIN_VALUE. Clobbers xtmp1/xtmp2/ktmp1/ktmp2.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Load the 64-bit sign-flip pattern (dword-granular move; the byte pattern
  // loaded is the same) and test whether any dst qword lane matches it.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes that are >= 0 get Long.MAX_VALUE
  // (vpternlogq imm 0x11 = bitwise NOT of xtmp1 into xtmp2).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4991 
4992 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4993                                                              XMMRegister xtmp, int index, int vec_enc) {
4994    assert(vec_enc < Assembler::AVX_512bit, "");
4995    if (vec_enc == Assembler::AVX_256bit) {
4996      vextractf128_high(xtmp, src);
4997      vshufps(dst, src, xtmp, index, vec_enc);
4998    } else {
4999      vshufps(dst, src, zero, index, vec_enc);
5000    }
5001 }
5002 
// Fixes up the result of a truncating double->int vector conversion on AVX
// (no opmasks). 'dst' holds the raw vcvttpd2dq result packed into 128 bits,
// 'src' the original doubles at src_vec_enc width; the double-width compare
// masks are therefore packed down to dword lanes before each blend.
// NaN -> 0; positive specials -> Integer.MAX_VALUE; negative specials keep
// 0x80000000 == Integer.MIN_VALUE. Clobbers xtmp1-xtmp5.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5042 
5043 
// Narrows the 32-bit int lanes in dst down to the subword type 'to_elem_bt'
// (T_SHORT or T_BYTE). 'zero' must hold an all-zero vector: it both supplies
// the second (discarded) source for the unsigned packs and, via the masking
// vpand, guarantees the pack inputs are in unsigned range. On 256-bit vectors
// the packs operate within 128-bit lanes, so a cross-lane doubleword pack
// (imm 0x44) re-packs the halves into the low 128 bits afterwards.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Keep the low 16 bits of each int lane, then pack dwords to words.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case  T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Keep the low 8 bits of each int lane, pack dwords to words, then words to bytes.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5067 
5068 /*
5069  * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5070  * a) Perform vector D2L/F2I cast.
5071  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5072  *    It signifies that source value could be any of the special floating point
5073  *    values(NaN,-Inf,Inf,Max,-Min).
5074  * c) Set destination to zero if source is NaN value.
5075  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5076  */
5077 
// Casts a float vector to int or a subword integral type on AVX (no opmasks).
// Steps: truncating convert, fix up special values (see algorithm comment
// above), then narrow below 4 bytes if a subword target was requested.
// Clobbers xtmp1-xtmp4.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // The subword narrowing needs an all-zero vector; xtmp4 is reused for it.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5090 
5091 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5092                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5093                                             Register rscratch, int vec_enc) {
5094   int to_elem_sz = type2aelembytes(to_elem_bt);
5095   assert(to_elem_sz <= 4, "");
5096   vcvttps2dq(dst, src, vec_enc);
5097   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5098   switch(to_elem_bt) {
5099     case T_INT:
5100       break;
5101     case T_SHORT:
5102       evpmovdw(dst, dst, vec_enc);
5103       break;
5104     case T_BYTE:
5105       evpmovdb(dst, dst, vec_enc);
5106       break;
5107     default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5108   }
5109 }
5110 
// Casts a float vector to a long vector (EVEX): truncating evcvttps2qq
// followed by the special-value fixup. double_sign_flip supplies the 64-bit
// sign-flip pattern the fixup helper compares result lanes against.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5117 
// Handling for downcasting from double to integer or sub-word types on AVX2.
// Steps: truncating convert (result packed into 128 bits), fix up special
// values, then narrow below 4 bytes if a subword target was requested.
// Clobbers xtmp1-xtmp5.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes (left behind by the special-cases helper).
    // The int lanes all fit in 128 bits after vcvttpd2dq, hence AVX_128bit.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5132 
// Casts a double vector to an integral vector of type 'to_elem_bt' (EVEX).
// With AVX512DQ a direct double->long convert is used and the result is
// narrowed from qwords; without it the double->int convert is used (targets
// of at most 4 bytes) and the result narrowed from dwords. Special values
// are patched by the corresponding special-cases helper in both paths.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    // D2L path: sign_flip must hold the 64-bit pattern here.
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        // Narrow in two steps: qword -> dword -> word.
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        // Narrow in two steps: qword -> dword -> byte.
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    // D2I path (no AVX512DQ): sign_flip must hold the 32-bit pattern here.
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5173 
5174 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5175   switch(to_elem_bt) {
5176     case T_LONG:
5177       evcvttps2qqs(dst, src, vec_enc);
5178       break;
5179     case T_INT:
5180       evcvttps2dqs(dst, src, vec_enc);
5181       break;
5182     case T_SHORT:
5183       evcvttps2dqs(dst, src, vec_enc);
5184       evpmovdw(dst, dst, vec_enc);
5185       break;
5186     case T_BYTE:
5187       evcvttps2dqs(dst, src, vec_enc);
5188       evpmovdb(dst, dst, vec_enc);
5189       break;
5190     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5191   }
5192 }
5193 
5194 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5195   switch(to_elem_bt) {
5196     case T_LONG:
5197       evcvttps2qqs(dst, src, vec_enc);
5198       break;
5199     case T_INT:
5200       evcvttps2dqs(dst, src, vec_enc);
5201       break;
5202     case T_SHORT:
5203       evcvttps2dqs(dst, src, vec_enc);
5204       evpmovdw(dst, dst, vec_enc);
5205       break;
5206     case T_BYTE:
5207       evcvttps2dqs(dst, src, vec_enc);
5208       evpmovdb(dst, dst, vec_enc);
5209       break;
5210     default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5211   }
5212 }
5213 
5214 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5215   switch(to_elem_bt) {
5216     case T_LONG:
5217       evcvttpd2qqs(dst, src, vec_enc);
5218       break;
5219     case T_INT:
5220       evcvttpd2dqs(dst, src, vec_enc);
5221       break;
5222     case T_SHORT:
5223       evcvttpd2dqs(dst, src, vec_enc);
5224       evpmovdw(dst, dst, vec_enc);
5225       break;
5226     case T_BYTE:
5227       evcvttpd2dqs(dst, src, vec_enc);
5228       evpmovdb(dst, dst, vec_enc);
5229       break;
5230     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5231   }
5232 }
5233 
5234 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5235   switch(to_elem_bt) {
5236     case T_LONG:
5237       evcvttpd2qqs(dst, src, vec_enc);
5238       break;
5239     case T_INT:
5240       evcvttpd2dqs(dst, src, vec_enc);
5241       break;
5242     case T_SHORT:
5243       evcvttpd2dqs(dst, src, vec_enc);
5244       evpmovdw(dst, dst, vec_enc);
5245       break;
5246     case T_BYTE:
5247       evcvttpd2dqs(dst, src, vec_enc);
5248       evpmovdb(dst, dst, vec_enc);
5249       break;
5250     default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5251   }
5252 }
5253 
5254 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5255                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5256                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5257   // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5258   // and re-instantiate original MXCSR.RC mode after that.
5259   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5260 
5261   mov64(tmp, julong_cast(0.5L));
5262   evpbroadcastq(xtmp1, tmp, vec_enc);
5263   vaddpd(xtmp1, src , xtmp1, vec_enc);
5264   evcvtpd2qq(dst, xtmp1, vec_enc);
5265   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5266                                                 double_sign_flip, vec_enc);;
5267 
5268   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5269 }
5270 
// Rounds each float lane of src to int, implemented as floor(val + 0.5) under
// a temporarily switched MXCSR rounding mode (supplied by new_mxcsr, round
// towards -inf); the standard mode is restored afterwards. Special values are
// patched by the EVEX float->int special-cases helper. 'tmp' is the scratch GPR.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Materialize float 0.5 in xtmp1 (via a GPR) and broadcast it to all lanes.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  // Convert using the active (round towards -inf) rounding mode.
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  // Restore the standard MXCSR rounding mode.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5288 
// AVX (no opmask) variant of float round-to-int: floor(val + 0.5) under a
// temporarily switched MXCSR rounding mode, with special values patched by
// the AVX float->int special-cases helper. 'tmp' is the scratch GPR.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Materialize float 0.5 in xtmp1 (via a GPR) and broadcast it to all lanes.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  // Convert using the active (round towards -inf) rounding mode.
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the standard MXCSR rounding mode.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5305 
// Zero-extending (unsigned) widening cast between integral vector element
// types: selects the vpmovzx* instruction for the (from, to) pair. Only
// widening conversions are supported; anything else is a programming error.
void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      // int can only widen to long.
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5332 
// Sign-extending (signed) widening cast between integral vector element
// types: selects the vpmovsx* instruction for the (from, to) pair. Parallel
// to vector_unsigned_cast above; only widening conversions are supported.
void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      // int can only widen to long.
      assert(to_elem_bt == T_LONG, "");
      vpmovsxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5359 
// Converts a boolean vector mask (lanes of all-ones / all-zeros) between
// element sizes without AVX-512 opmasks. Widening uses sign extension, which
// preserves all-ones lanes. Narrowing uses signed-saturating packs (all-ones
// saturates to all-ones); on 256-bit vectors the packs operate within 128-bit
// lanes, so vpermq/vpshufd shuffles repair the lane ordering.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Size the operation by the wider of the two element types.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend by the size ratio (2x, 4x or 8x).
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: pack by the size ratio; 256-bit forms need lane fix-ups.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          // Gather the meaningful quadwords from both 128-bit lanes into the low half.
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          // Result now fits in 128 bits; finish the pack at that width.
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // Pre-shuffle dwords (imm 0x08) before the two pack steps.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5414 
5415 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5416                                    bool merge, BasicType bt, int vlen_enc) {
5417   if (bt == T_INT) {
5418     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5419   } else {
5420     assert(bt == T_LONG, "");
5421     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5422   }
5423 }
5424 
5425 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5426                                    bool merge, BasicType bt, int vlen_enc) {
5427   if (bt == T_INT) {
5428     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5429   } else {
5430     assert(bt == T_LONG, "");
5431     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5432   }
5433 }
5434 
// Expand the low mask_len bits of scalar 'src' into a byte-per-lane vector mask in
// 'dst': PDEP with the 0x01 repeating selector deposits each source bit into the
// LSB of one byte, so every destination byte becomes 0 or 1. Processes 8 bits
// (one quadword) per iteration; mask_len must be a multiple of 8 past the first
// chunk. Clobbers rtmp1, rtmp2 and xtmp.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Deposit the low 8 bits of src into the LSBs of the 8 bytes of rtmp1.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of src and stage the first quadword in xtmp.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 16-byte group: clear the staging register.
      pxor(xtmp, xtmp);
    }
    // Expand the next 8 bits of the source into one quadword of bytes.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5473 
// Common scalar tail for mask queries: 'tmp' holds the mask as a bit-per-lane
// integer (already clipped to masklen by the callers where required) and 'dst'
// receives the result. Clobbers tmp.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      // Number of set lanes is simply the population count.
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // Index of highest set bit = 63 - leading zero count.
        // LZCNT yields 64 for an all-zero mask, giving the expected -1.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // BSR leaves the destination undefined and sets ZF when the source is
        // zero, so preload the -1 sentinel and conditionally overwrite it.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit just past the mask so TZCNT returns masklen
          // (not 32) when no lane is set.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // TZCNT of zero is the operand width (32), which is already correct.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        // BSF is undefined for a zero source, so either plant a sentinel bit
        // (masklen < 32) or preload masklen and use a conditional move.
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);
          if (masklen == 32)  {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The (clipped) bit mask in tmp is itself the result.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5523 
// Mask query (trueCount/firstTrue/lastTrue/toLong) for an AVX-512 opmask
// register: move the opmask into GPR 'tmp', clip stray high bits where needed,
// then delegate to vector_mask_operation_helper. Clobbers tmp.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt: the helper plants its own sentinel bit past masklen.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5543 
// Mask query for a vector-register mask (pre-AVX-512 encoding, lane values
// 0 / -1, or 0 / 1 for T_BOOLEAN): extract the lane sign bits into GPR 'tmp'
// with a movmsk, clip if the movmsk produced more bits than masklen, then
// delegate to vector_mask_operation_helper. Clobbers xtmp and tmp.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - x) to turn 1 into -1 so the byte sign bits are set for vpmovmskb.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack words down to bytes first (saturation preserves 0 / -1), then use
      // the byte movmsk. For 256-bit input the per-lane pack result is gathered
      // into the low 128 bits with vpermpd selector 8 before the movmsk.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt: the helper plants its own sentinel bit past masklen.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5593 
// Compress opmask 'src': dst receives a mask with the same number of set bits
// as src (restricted to the low mask_len bits) packed into the least-significant
// positions. PEXT of an all-ones value through the source mask yields exactly
// (1 << popcount(src)) - 1. Clobbers rtmp1 and rtmp2; requires BMI2 (pextq).
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Keep only the low mask_len bits of the source mask.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  // Extract one 1-bit per set bit of rtmp1, packed at the low end.
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5602 
// AVX2 fallback for vector compress/expand (no AVX-512 VPCOMPRESS/VPEXPAND):
// turn the vector mask into a scalar bitmask, use it to index a precomputed
// 32-byte permutation-table row, permute the source lanes accordingly, and zero
// the unselected lanes. Only 4- and 8-byte element types are supported.
// Clobbers rtmp, rscratch, permv and xtmp.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  // Load the permutation row selected by the mask bit pattern.
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5636 
// AVX-512 vector compress/expand: dispatch to the type-specific
// VPCOMPRESS*/VPEXPAND* (or VCOMPRESSPS/PD, VEXPANDPS/PD) instruction, gathering
// the lanes selected by 'mask' (CompressV) or scattering them back out (ExpandV).
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
    case T_BYTE:
      evpcompressb(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcompressw(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      evpcompressd(dst, mask, src, merge, vec_enc);
      break;
    case T_FLOAT:
      evcompressps(dst, mask, src, merge, vec_enc);
      break;
    case T_LONG:
      evpcompressq(dst, mask, src, merge, vec_enc);
      break;
    case T_DOUBLE:
      evcompresspd(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
    case T_BYTE:
      evpexpandb(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      evpexpandw(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      evpexpandd(dst, mask, src, merge, vec_enc);
      break;
    case T_FLOAT:
      evexpandps(dst, mask, src, merge, vec_enc);
      break;
    case T_LONG:
      evpexpandq(dst, mask, src, merge, vec_enc);
      break;
    case T_DOUBLE:
      evexpandpd(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
    }
  }
}
5692 
// EVEX vector signum: dst[i] = -1.0 if src[i] < 0, +1.0 if src[i] > 0, and
// src[i] itself when src[i] is NaN, -0.0 or 0.0. 'zero' and 'one' are assumed
// to hold broadcast 0.0 and 1.0 respectively (the initial vsub producing the
// -1.0 vector relies on this) — supplied by the caller. Clobbers ktmp1.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    // EQ_UQ compares unordered-or-equal, so it also selects NaN lanes.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5714 
// AVX (non-EVEX) vector signum: same contract as vector_signum_evex but using
// variable blends keyed off sign bits / comparison masks instead of opmask
// registers. 'zero' and 'one' are assumed to hold broadcast 0.0 and 1.0.
// Clobbers xtmp1.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    // Blend selects per-lane by the sign bit of 'src'.
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    // EQ_UQ compares unordered-or-equal, so it also selects NaN lanes.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5734 
// Materialize a "mask-all" opmask from scalar 'src' into 'dst', truncated to
// mask_len bits via a right shift that discards the upper (32 - mask_len) or
// (16 - mask_len) bits. NOTE(review): this presumably expects src to hold 0 or
// all-ones so the surviving low bits are uniform — confirm against callers.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      // Masks wider than 32 bits require the 64-bit opmask move.
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the bits beyond mask_len.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW only 16-bit opmask operations are available.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5753 
5754 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5755   int lane_size = type2aelembytes(bt);
5756   if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5757       (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5758     movptr(rtmp, imm32);
5759     switch(lane_size) {
5760       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5761       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5762       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5763       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5764       fatal("Unsupported lane size %d", lane_size);
5765       break;
5766     }
5767   } else {
5768     movptr(rtmp, imm32);
5769     movq(dst, rtmp);
5770     switch(lane_size) {
5771       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5772       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5773       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5774       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5775       fatal("Unsupported lane size %d", lane_size);
5776       break;
5777     }
5778   }
5779 }
5780 
5781 //
5782 // Following is lookup table based popcount computation algorithm:-
5783 //       Index   Bit set count
5784 //     [ 0000 ->   0,
5785 //       0001 ->   1,
5786 //       0010 ->   1,
5787 //       0011 ->   2,
5788 //       0100 ->   1,
5789 //       0101 ->   2,
5790 //       0110 ->   2,
5791 //       0111 ->   3,
5792 //       1000 ->   1,
5793 //       1001 ->   2,
5794 //       1010 ->   3,
5795 //       1011 ->   3,
5796 //       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5799 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5800 //     shuffle indices for lookup table access.
5801 //  b. Right shift each byte of vector lane by 4 positions.
5802 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5803 //     shuffle indices for lookup table access.
5804 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5805 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5806 //     count of all the bytes of a quadword.
5807 //  f. Perform step e. for upper 128bit vector lane.
5808 //  g. Pack the bitset count of quadwords back to double word.
5809 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
5810 
// Per-byte population count via the nibble lookup table (steps a-d of the
// algorithm described above): shuffle the LUT by the low and high nibbles of
// each byte separately and add the two partial counts. Clobbers xtmp1, xtmp2
// and rtmp.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  // 0x0F mask isolates one nibble per byte.
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);     // dst   = high nibbles
  vpand(xtmp1, src, xtmp1, vec_enc);   // xtmp1 = low nibbles
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  // Table lookups: each nibble value indexes its bit count in the LUT.
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  // Per-byte count = count(high nibble) + count(low nibble).
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5823 
// Per-int population count: byte counts first, then steps e-h of the algorithm
// above — unpack dwords against zero to quadwords, sum the bytes of each
// quadword with VPSADBW, and pack the quadword sums back into dwords.
// Clobbers xtmp1 and xtmp2.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);      // sums for the high dwords
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);  // sums for the low dwords
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5835 
// Per-short population count: byte counts first, then combine the counts of
// the two bytes of each word. Clobbers xtmp1 and xtmp2.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);        // upper-byte counts, shifted down
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);   // lower-byte counts, isolated
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5845 
// Per-long population count: byte counts first, then VPSADBW against zero sums
// all eight byte counts of each quadword in one step (no unpack/pack needed —
// step h of the algorithm above). Clobbers xtmp1 and xtmp2.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5852 
// Dispatch the LUT-based vector popcount to the element-type-specific helper
// (used when the AVX-512 VPOPCNT* instructions are unavailable).
void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                 XMMRegister xtmp2, Register rtmp, int vec_enc) {
  switch(bt) {
    case T_LONG:
      vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5875 
// Native AVX-512 vector popcount: VPOPCNTD/Q require AVX512_VPOPCNTDQ, the
// subword forms VPOPCNTW/B require AVX512_BITALG. Masked/merging variants are
// driven by 'mask' and 'merge'.
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5903 
5904 // Bit reversal algorithm first reverses the bits of each byte followed by
5905 // a byte level reversal for multi-byte primitive types (short/int/long).
5906 // Algorithm performs a lookup table access to get reverse bit sequence
5907 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5908 // is obtained by swapping the reverse bit sequences of upper and lower
5909 // nibble of a byte.
// Reverse the bits of each element (see the algorithm comment above): reverse
// the bits within every byte via the nibble LUT (or shift/swap steps on plain
// AVX-512), then byte-reverse multi-byte elements. Clobbers xtmp1, xtmp2, rtmp.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    // The reversed low nibble becomes the high nibble of the result byte.
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    // Copy dst aside first: vector_swap_nbits writes its dst before it is done
    // reading its src, so the two must not alias.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 path: same nibble-LUT scheme as above with non-EVEX logicals.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
5967 
// GFNI-based bit reversal: a single VGF2P8AFFINEQB with the broadcast 'mask'
// matrix reverses the bits of every byte, then multi-byte elements are
// byte-reversed. Clobbers xtmp (and rscratch if the mask is not reachable).
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
5979 
// Swap adjacent nbits-wide bit groups in every lane:
// dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits),
// where 'bitmask' (broadcast from a 32-bit pattern) selects the low group of
// each pair. Clobbers xtmp1; dst must not alias src (dst is written before src
// is last read).
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
5989 
// Shift/rotate-based byte reversal of each element (EVEX-only path, used where
// the shuffle-table variant is unavailable): progressively swap halves —
// dwords within qwords, words within dwords, bytes within words — until the
// element's byte order is reversed. Clobbers xtmp1 and xtmp2.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      // Then swap the words within each double word, and finally the bytes.
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte elements: byte reversal is the identity — just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6019 
// Byte-reverse each element of 'src' into 'dst' using a VPSHUFB with a
// precomputed per-type permutation mask. T_BYTE is a plain copy since
// reversing a single byte is the identity.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  // Shuffle src's bytes by the mask loaded into dst above.
  vpshufb(dst, src, dst, vec_enc);
}
6048 
// AVX-512 leading-zero count per element. T_LONG/T_INT map directly to
// VPLZCNTQ/D (AVX512CD). T_SHORT widens words to dwords interleaved with
// all-ones padding so the dword lzcnt equals the word lzcnt, then packs back.
// T_BYTE uses the nibble lookup table. Clobbers xtmp1-xtmp3, ktmp and rtmp.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // xtmp1 = all-ones (ternary-logic func 0xff).
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      // Interleave so each dword is (src_word << 16) | 0xFFFF; the dword lzcnt
      // then equals the 16-bit lzcnt of the source word.
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      // Pack the dword counts back down to words.
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      // ktmp selects bytes whose upper nibble is zero; only those bytes add
      // the lower-nibble count (merge-masked add).
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6093 
// AVX/AVX2 per-byte leading-zero count via the nibble lookup table.
// Clobbers xtmp1-xtmp3 and rtmp; leaves xtmp1 zeroed (callers rely on this).
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 flags bytes whose upper nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1+T2 where the upper nibble was zero, otherwise just T2.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6113 
// AVX/AVX2 per-short leading-zero count: byte-level counts first, then combine
// the two byte counts of each word — the low byte's count only contributes
// when the entire high byte is zero. Clobbers xtmp1-xtmp3 and rtmp.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // Candidate combined count in the upper byte position: high-byte count plus
  // low-byte count.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Move the word's count from the upper byte down to the result position.
  vpsrlw(dst, dst, 8, vec_enc);
}
6127 
// AVX/AVX2 per-int leading-zero count using the int->float conversion trick:
// the float's biased exponent encodes the position of the highest set bit.
// Clobbers xtmp1-xtmp3.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  // (xtmp1 = all-ones is reused below to derive the constants 0xFF, 127 and 31
  // via right shifts by 24, 25 and 27.)
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in every lane.
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6167 
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Per-long leading zero count, composed from two 32-bit counts per lane.
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6189 
6190 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6191                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6192                                                        Register rtmp, int vec_enc) {
6193   assert(is_integral_type(bt), "unexpected type");
6194   assert(vec_enc < Assembler::AVX_512bit, "");
6195   switch(bt) {
6196     case T_LONG:
6197       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6198       break;
6199     case T_INT:
6200       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6201       break;
6202     case T_SHORT:
6203       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6204       break;
6205     case T_BYTE:
6206       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6207       break;
6208     default:
6209       fatal("Unsupported type %s", type2name(bt));
6210       break;
6211   }
6212 }
6213 
6214 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6215   switch(bt) {
6216     case T_BYTE:
6217       vpsubb(dst, src1, src2, vec_enc);
6218       break;
6219     case T_SHORT:
6220       vpsubw(dst, src1, src2, vec_enc);
6221       break;
6222     case T_INT:
6223       vpsubd(dst, src1, src2, vec_enc);
6224       break;
6225     case T_LONG:
6226       vpsubq(dst, src1, src2, vec_enc);
6227       break;
6228     default:
6229       fatal("Unsupported type %s", type2name(bt));
6230       break;
6231   }
6232 }
6233 
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1 (vpternlogd with imm 0xff sets every bit)
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src (vpternlogd imm 0x40 computes A & B & ~C with A = B = xtmp4, C = src)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // Broadcast the lane width in bits and subtract the CLZ result from it.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6252 
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src  (x | -x: sets every bit at and above the lowest set bit)
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // Broadcast the lane width in bits; subtracting the popcount yields CTZ
  // (and the full width for a zero lane, since POPC(0 | -0) == 0).
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6268 
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  // 32-bit unsigned divide: dividend in rax, quotient left in rax; rdx is clobbered.
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has its top bit clear: plain hardware unsigned divide (rdx:rax / divisor).
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0 (signed), i.e. unsigned divisor >= 2^31:
  // the unsigned quotient can only be 0 or 1, so avoid the expensive DIV.
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // andn computes rax = ~rdx & rax in one instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Isolate the sign bit of the expression above: 1 iff dividend >= divisor (unsigned).
  shrl(rax, 31);
  bind(done);
}
6292 
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  // 32-bit unsigned remainder: dividend in rax, remainder left in rdx; rax is clobbered.
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor has its top bit clear: plain hardware unsigned divide leaves the remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0 (signed), i.e. unsigned divisor >= 2^31 — the
  // quotient is 0 or 1, so the remainder is dividend or dividend - divisor:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);   // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift turns the sign bit into an all-ones/all-zeros mask for divisor.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6318 
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  // Combined 32-bit unsigned divide+remainder: dividend in rax; quotient left
  // in rax and remainder in rdx. tmp is clobbered.
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor top bit clear: hardware divide produces both results at once.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0 (signed), i.e. unsigned divisor >= 2^31 (quotient is 0 or 1):
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);   // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // The sign bit of rax now decides both results: shift it out for the
  // quotient, and sign-extend it into a mask for the remainder adjustment.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6349 
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  // Reverse the bit order of a 32-bit value: dst = bit-reversed src.
  // Strategy: reverse the bits within each byte, then byte-swap (bswapl).
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The anti-diagonal affine matrix 0x8040201008040201 reverses each byte's bits.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Classic mask-and-shift bit reversal (swap progressively larger groups).
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bytes now each contain reversed bits; reversing the byte order completes the job.
  bswapl(dst);
}
6388 
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  // Reverse the bit order of a 64-bit value: dst = bit-reversed src.
  // Same structure as reverseI: per-byte bit reversal followed by bswapq.
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // 64-bit constants cannot be used as immediates, so each mask is
    // materialized in rtmp2; notq derives the complementary mask in place.
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bytes now each contain reversed bits; reversing the byte order completes the job.
  bswapq(dst);
}
6433 
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  // 64-bit unsigned divide: dividend in rax, quotient left in rax; rdx is clobbered.
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor top bit clear: plain hardware unsigned divide (rdx:rax / divisor).
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0 (signed), i.e. unsigned divisor >= 2^63 —
  // the unsigned quotient can only be 0 or 1:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // andn computes rax = ~rdx & rax in one instruction.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Isolate the sign bit: 1 iff dividend >= divisor (unsigned).
  shrq(rax, 63);
  bind(done);
}
6457 
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  // 64-bit unsigned remainder: dividend in rax, remainder left in rdx; rax is clobbered.
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor top bit clear: hardware divide leaves the remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0 (signed), i.e. unsigned divisor >= 2^63 — the
  // quotient is 0 or 1, so the remainder is dividend or dividend - divisor:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);   // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Arithmetic shift turns the sign bit into an all-ones/all-zeros mask for divisor.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6483 
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  // Combined 64-bit unsigned divide+remainder: dividend in rax; quotient left
  // in rax and remainder in rdx. tmp is clobbered.
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Divisor top bit clear: hardware divide produces both results at once.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0 (signed), i.e. unsigned divisor >= 2^63 (quotient is 0 or 1):
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);   // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // The sign bit of rax now decides both results: shift it out for the
  // quotient, and sign-extend it into a mask for the remainder adjustment.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6513 
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  // Cross-lane byte rearrangement: dst[i] = src[shuffle[i]] across the full
  // vector, emulated on top of the in-lane byte shuffle (vpshufb/evpshufb).
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  // (xtmp2 = 32 per byte; the constants 16/32/48 are built by shifting/adding.)
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6559 
6560 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6561                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6562   if (vlen_enc == AVX_128bit) {
6563     vpermilps(dst, src, shuffle, vlen_enc);
6564   } else if (bt == T_INT) {
6565     vpermd(dst, shuffle, src, vlen_enc);
6566   } else {
6567     assert(bt == T_FLOAT, "");
6568     vpermps(dst, shuffle, src, vlen_enc);
6569   }
6570 }
6571 
6572 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6573   switch(opcode) {
6574     case Op_AddHF: vaddsh(dst, src1, src2); break;
6575     case Op_SubHF: vsubsh(dst, src1, src2); break;
6576     case Op_MulHF: vmulsh(dst, src1, src2); break;
6577     case Op_DivHF: vdivsh(dst, src1, src2); break;
6578     default: assert(false, "%s", NodeClassNames[opcode]); break;
6579   }
6580 }
6581 
6582 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6583   switch(elem_bt) {
6584     case T_BYTE:
6585       if (ideal_opc == Op_SaturatingAddV) {
6586         vpaddsb(dst, src1, src2, vlen_enc);
6587       } else {
6588         assert(ideal_opc == Op_SaturatingSubV, "");
6589         vpsubsb(dst, src1, src2, vlen_enc);
6590       }
6591       break;
6592     case T_SHORT:
6593       if (ideal_opc == Op_SaturatingAddV) {
6594         vpaddsw(dst, src1, src2, vlen_enc);
6595       } else {
6596         assert(ideal_opc == Op_SaturatingSubV, "");
6597         vpsubsw(dst, src1, src2, vlen_enc);
6598       }
6599       break;
6600     default:
6601       fatal("Unsupported type %s", type2name(elem_bt));
6602       break;
6603   }
6604 }
6605 
6606 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6607   switch(elem_bt) {
6608     case T_BYTE:
6609       if (ideal_opc == Op_SaturatingAddV) {
6610         vpaddusb(dst, src1, src2, vlen_enc);
6611       } else {
6612         assert(ideal_opc == Op_SaturatingSubV, "");
6613         vpsubusb(dst, src1, src2, vlen_enc);
6614       }
6615       break;
6616     case T_SHORT:
6617       if (ideal_opc == Op_SaturatingAddV) {
6618         vpaddusw(dst, src1, src2, vlen_enc);
6619       } else {
6620         assert(ideal_opc == Op_SaturatingSubV, "");
6621         vpsubusw(dst, src1, src2, vlen_enc);
6622       }
6623       break;
6624     default:
6625       fatal("Unsupported type %s", type2name(elem_bt));
6626       break;
6627   }
6628 }
6629 
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // Unsigned saturating subtraction (int/long lanes) on EVEX targets:
  // an underflowing lane saturates to zero.
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // (note the swapped operand order: src2 > src1 is expressed as src2 < src1 reversed)
  evpcmpu(elem_bt, ktmp,  src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  // evmasked_op with merge disabled zeroes the masked (overflowing) lanes.
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6638 
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Unsigned saturating subtraction (int/long lanes) for AVX (non-EVEX) targets:
  // an underflowing lane saturates to zero.
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = biased(src2) > biased(src1), i.e. src1 <u src2 (underflow mask).
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6655 
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned saturating addition (int/long lanes) on EVEX targets:
  // an overflowing lane saturates to the maximum unsigned value (all ones).
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1 (vpternlogd imm 0xff sets every bit)
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp,  dst, xtmp2, true, vlen_enc);
}
6671 
6672 //
6673 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6674 // unsigned addition operation.
6675 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6676 //
6677 // We empirically determined its semantic equivalence to following reduced expression
6678 //    overflow_mask =  (a + b) <u (a | b)
6679 //
6680 // and also verified it though Alive2 solver.
6681 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6682 //
6683 
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Unsigned saturating addition (int/long lanes) for AVX (non-EVEX) targets:
  // an overflowing lane saturates to all ones (max unsigned).
  // Overflow check: (src1 + src2) <u (src1 | src2); the unsigned compare is
  // emulated by biasing both sides with MIN_VALUE and comparing signed.
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE per lane; as a side effect xtmp1 is set to all ones
  // (used below as the saturation value).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Saturate overflowing lanes to -1 (all ones) from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6705 
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  // Set each bit of ktmp from the sign bit of the corresponding 64-bit lane
  // of src. AVX512DQ has a direct instruction; otherwise emulate by
  // sign-extending each lane (arithmetic shift by 63) and comparing to -1.
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // Materialize -1 in xtmp2 unless the caller guarantees it already holds -1.
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6719 
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  // Set each bit of ktmp from the sign bit of the corresponding 32-bit lane
  // of src. AVX512DQ has a direct instruction; otherwise emulate by
  // sign-extending each lane (arithmetic shift by 31) and comparing to -1.
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // Materialize -1 in xtmp2 unless the caller guarantees it already holds -1.
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6733 
6734 
6735 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6736   if (elem_bt == T_LONG) {
6737     if (VM_Version::supports_evex()) {
6738       evpsraq(dst, src, 63, vlen_enc);
6739     } else {
6740       vpsrad(dst, src, 31, vlen_enc);
6741       vpshufd(dst, dst, 0xF5, vlen_enc);
6742     }
6743   } else {
6744     assert(elem_bt == T_INT, "");
6745     vpsrad(dst, src, 31, vlen_enc);
6746   }
6747 }
6748 
6749 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6750   if (compute_allones) {
6751     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6752       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6753     } else {
6754       vpcmpeqq(allones, allones, allones, vlen_enc);
6755     }
6756   }
6757   if (elem_bt == T_LONG) {
6758     vpsrlq(dst, allones, 1, vlen_enc);
6759   } else {
6760     assert(elem_bt == T_INT, "");
6761     vpsrld(dst, allones, 1, vlen_enc);
6762   }
6763 }
6764 
6765 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6766   if (compute_allones) {
6767     if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6768       vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6769     } else {
6770       vpcmpeqq(allones, allones, allones, vlen_enc);
6771     }
6772   }
6773   if (elem_bt == T_LONG) {
6774     vpsllq(dst, allones, 63, vlen_enc);
6775   } else {
6776     assert(elem_bt == T_INT, "");
6777     vpslld(dst, allones, 31, vlen_enc);
6778   }
6779 }
6780 
6781 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
6782                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6783   switch(elem_bt) {
6784     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6785     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6786     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6787     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6788     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6789   }
6790 }
6791 
6792 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6793   switch(elem_bt) {
6794     case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6795     case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6796     case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6797     case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6798     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6799   }
6800 }
6801 
6802 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6803                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6804   if (elem_bt == T_LONG) {
6805     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6806   } else {
6807     assert(elem_bt == T_INT, "");
6808     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6809   }
6810 }
6811 
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  // Signed saturating add/subtract for int/long lanes on EVEX targets.
  // Overflowing lanes are replaced with MAX_VALUE or MIN_VALUE depending on
  // the sign of the first input.
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6854 
6855 
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  // Signed saturating add/subtract for int/long lanes on AVX (non-EVEX) targets.
  // Same algorithm as the EVEX variant, but with vector masks and vpblendvb
  // instead of mask registers.
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all ones, then derive the MAX/MIN saturation constants from it.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6896 
6897 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6898   switch(elem_bt) {
6899     case T_BYTE:
6900       if (ideal_opc == Op_SaturatingAddV) {
6901         vpaddsb(dst, src1, src2, vlen_enc);
6902       } else {
6903         assert(ideal_opc == Op_SaturatingSubV, "");
6904         vpsubsb(dst, src1, src2, vlen_enc);
6905       }
6906       break;
6907     case T_SHORT:
6908       if (ideal_opc == Op_SaturatingAddV) {
6909         vpaddsw(dst, src1, src2, vlen_enc);
6910       } else {
6911         assert(ideal_opc == Op_SaturatingSubV, "");
6912         vpsubsw(dst, src1, src2, vlen_enc);
6913       }
6914       break;
6915     default:
6916       fatal("Unsupported type %s", type2name(elem_bt));
6917       break;
6918   }
6919 }
6920 
6921 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6922   switch(elem_bt) {
6923     case T_BYTE:
6924       if (ideal_opc == Op_SaturatingAddV) {
6925         vpaddusb(dst, src1, src2, vlen_enc);
6926       } else {
6927         assert(ideal_opc == Op_SaturatingSubV, "");
6928         vpsubusb(dst, src1, src2, vlen_enc);
6929       }
6930       break;
6931     case T_SHORT:
6932       if (ideal_opc == Op_SaturatingAddV) {
6933         vpaddusw(dst, src1, src2, vlen_enc);
6934       } else {
6935         assert(ideal_opc == Op_SaturatingSubV, "");
6936         vpsubusw(dst, src1, src2, vlen_enc);
6937       }
6938       break;
6939     default:
6940       fatal("Unsupported type %s", type2name(elem_bt));
6941       break;
6942   }
6943 }
6944 
6945 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6946                                                      XMMRegister src2, int vlen_enc) {
6947   switch(elem_bt) {
6948     case T_BYTE:
6949       evpermi2b(dst, src1, src2, vlen_enc);
6950       break;
6951     case T_SHORT:
6952       evpermi2w(dst, src1, src2, vlen_enc);
6953       break;
6954     case T_INT:
6955       evpermi2d(dst, src1, src2, vlen_enc);
6956       break;
6957     case T_LONG:
6958       evpermi2q(dst, src1, src2, vlen_enc);
6959       break;
6960     case T_FLOAT:
6961       evpermi2ps(dst, src1, src2, vlen_enc);
6962       break;
6963     case T_DOUBLE:
6964       evpermi2pd(dst, src1, src2, vlen_enc);
6965       break;
6966     default:
6967       fatal("Unsupported type %s", type2name(elem_bt));
6968       break;
6969   }
6970 }
6971 
6972 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6973   if (is_unsigned) {
6974     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6975   } else {
6976     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6977   }
6978 }
6979 
6980 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6981   if (is_unsigned) {
6982     vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6983   } else {
6984     vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6985   }
6986 }
6987 
6988 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6989   switch(opcode) {
6990     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6991     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6992     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6993     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6994     default: assert(false, "%s", NodeClassNames[opcode]); break;
6995   }
6996 }
6997 
6998 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6999   switch(opcode) {
7000     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7001     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7002     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7003     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7004     default: assert(false, "%s", NodeClassNames[opcode]); break;
7005   }
7006 }
7007 
// Scalar half-float max/min: reuses the vector implementation with a 128-bit
// vector length encoding (the scalar operand lives in the low lane of the
// XMM register).
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7012 
// Vector half-float max/min with Java semantics: unlike raw vmaxph/vminph,
// NaN inputs must propagate and -0.0 must compare smaller than +0.0. The
// operands are first swapped per-lane (keyed on a sign-bit mask) so that the
// instruction's "return the second source" tie/NaN rule produces the desired
// result, then NaN lanes are patched from the first blended operand.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
--- EOF ---