1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/biasedLocking.hpp"
  34 #include "runtime/objectMonitor.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 
  37 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  38   switch (vlen_in_bytes) {
  39     case  4: // fall-through
  40     case  8: // fall-through
  41     case 16: return Assembler::AVX_128bit;
  42     case 32: return Assembler::AVX_256bit;
  43     case 64: return Assembler::AVX_512bit;
  44 
  45     default: {
  46       ShouldNotReachHere();
  47       return Assembler::AVX_NoVec;
  48     }
  49   }
  50 }
  51 
  52 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  53   guarantee(PostLoopMultiversioning, "must be");
  54   Assembler::movl(dst, 1);
  55   Assembler::shlxl(dst, dst, src);
  56   Assembler::decl(dst);
  57   Assembler::kmovdl(mask, dst);
  58   Assembler::movl(dst, src);
  59 }
  60 
  61 void C2_MacroAssembler::restorevectmask(KRegister mask) {
  62   guarantee(PostLoopMultiversioning, "must be");
  63   Assembler::knotwl(mask, k0);
  64 }
  65 
  66 #if INCLUDE_RTM_OPT
  67 
  68 // Update rtm_counters based on abort status
  69 // input: abort_status
  70 //        rtm_counters (RTMLockingCounters*)
  71 // flags are killed
  72 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  73 
  74   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  75   if (PrintPreciseRTMLockingStatistics) {
  76     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  77       Label check_abort;
  78       testl(abort_status, (1<<i));
  79       jccb(Assembler::equal, check_abort);
  80       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  81       bind(check_abort);
  82     }
  83   }
  84 }
  85 
  86 // Branch if (random & (count-1) != 0), count is 2^n
  87 // tmp, scr and flags are killed
  88 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  89   assert(tmp == rax, "");
  90   assert(scr == rdx, "");
  91   rdtsc(); // modifies EDX:EAX
  92   andptr(tmp, count-1);
  93   jccb(Assembler::notZero, brLabel);
  94 }
  95 
  96 // Perform abort ratio calculation, set no_rtm bit if high ratio
  97 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  98 // tmpReg, rtm_counters_Reg and flags are killed
  99 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 100                                                     Register rtm_counters_Reg,
 101                                                     RTMLockingCounters* rtm_counters,
 102                                                     Metadata* method_data) {
 103   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 104 
 105   if (RTMLockingCalculationDelay > 0) {
 106     // Delay calculation
 107     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 108     testptr(tmpReg, tmpReg);
 109     jccb(Assembler::equal, L_done);
 110   }
 111   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 112   //   Aborted transactions = abort_count * 100
 113   //   All transactions = total_count *  RTMTotalCountIncrRate
 114   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 115 
 116   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 117   cmpptr(tmpReg, RTMAbortThreshold);
 118   jccb(Assembler::below, L_check_always_rtm2);
 119   imulptr(tmpReg, tmpReg, 100);
 120 
 121   Register scrReg = rtm_counters_Reg;
 122   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 123   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 124   imulptr(scrReg, scrReg, RTMAbortRatio);
 125   cmpptr(tmpReg, scrReg);
 126   jccb(Assembler::below, L_check_always_rtm1);
 127   if (method_data != NULL) {
 128     // set rtm_state to "no rtm" in MDO
 129     mov_metadata(tmpReg, method_data);
 130     lock();
 131     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 132   }
 133   jmpb(L_done);
 134   bind(L_check_always_rtm1);
 135   // Reload RTMLockingCounters* address
 136   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 137   bind(L_check_always_rtm2);
 138   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 139   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 140   jccb(Assembler::below, L_done);
 141   if (method_data != NULL) {
 142     // set rtm_state to "always rtm" in MDO
 143     mov_metadata(tmpReg, method_data);
 144     lock();
 145     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 146   }
 147   bind(L_done);
 148 }
 149 
 150 // Update counters and perform abort ratio calculation
 151 // input:  abort_status_Reg
 152 // rtm_counters_Reg, flags are killed
 153 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 154                                       Register rtm_counters_Reg,
 155                                       RTMLockingCounters* rtm_counters,
 156                                       Metadata* method_data,
 157                                       bool profile_rtm) {
 158 
 159   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 160   // update rtm counters based on rax value at abort
 161   // reads abort_status_Reg, updates flags
 162   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 163   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 164   if (profile_rtm) {
 165     // Save abort status because abort_status_Reg is used by following code.
 166     if (RTMRetryCount > 0) {
 167       push(abort_status_Reg);
 168     }
 169     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 170     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 171     // restore abort status
 172     if (RTMRetryCount > 0) {
 173       pop(abort_status_Reg);
 174     }
 175   }
 176 }
 177 
 178 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 179 // inputs: retry_count_Reg
 180 //       : abort_status_Reg
 181 // output: retry_count_Reg decremented by 1
 182 // flags are killed
 183 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 184   Label doneRetry;
 185   assert(abort_status_Reg == rax, "");
 186   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 187   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 188   // if reason is in 0x6 and retry count != 0 then retry
 189   andptr(abort_status_Reg, 0x6);
 190   jccb(Assembler::zero, doneRetry);
 191   testl(retry_count_Reg, retry_count_Reg);
 192   jccb(Assembler::zero, doneRetry);
 193   pause();
 194   decrementl(retry_count_Reg);
 195   jmp(retryLabel);
 196   bind(doneRetry);
 197 }
 198 
 199 // Spin and retry if lock is busy,
 200 // inputs: box_Reg (monitor address)
 201 //       : retry_count_Reg
 202 // output: retry_count_Reg decremented by 1
 203 //       : clear z flag if retry count exceeded
 204 // tmp_Reg, scr_Reg, flags are killed
 205 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 206                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 207   Label SpinLoop, SpinExit, doneRetry;
 208   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 209 
 210   testl(retry_count_Reg, retry_count_Reg);
 211   jccb(Assembler::zero, doneRetry);
 212   decrementl(retry_count_Reg);
 213   movptr(scr_Reg, RTMSpinLoopCount);
 214 
 215   bind(SpinLoop);
 216   pause();
 217   decrementl(scr_Reg);
 218   jccb(Assembler::lessEqual, SpinExit);
 219   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 220   testptr(tmp_Reg, tmp_Reg);
 221   jccb(Assembler::notZero, SpinLoop);
 222 
 223   bind(SpinExit);
 224   jmp(retryLabel);
 225   bind(doneRetry);
 226   incrementl(retry_count_Reg); // clear z flag
 227 }
 228 
 229 // Use RTM for normal stack locks
 230 // Input: objReg (object to lock)
 231 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 232                                          Register retry_on_abort_count_Reg,
 233                                          RTMLockingCounters* stack_rtm_counters,
 234                                          Metadata* method_data, bool profile_rtm,
 235                                          Label& DONE_LABEL, Label& IsInflated) {
 236   assert(UseRTMForStackLocks, "why call this otherwise?");
 237   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 238   assert(tmpReg == rax, "");
 239   assert(scrReg == rdx, "");
 240   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 241 
 242   if (RTMRetryCount > 0) {
 243     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 244     bind(L_rtm_retry);
 245   }
 246   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 247   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 248   jcc(Assembler::notZero, IsInflated);
 249 
 250   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 251     Label L_noincrement;
 252     if (RTMTotalCountIncrRate > 1) {
 253       // tmpReg, scrReg and flags are killed
 254       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 255     }
 256     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 257     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 258     bind(L_noincrement);
 259   }
 260   xbegin(L_on_abort);
 261   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 262   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 263   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 264   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 265 
 266   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 267   if (UseRTMXendForLockBusy) {
 268     xend();
 269     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 270     jmp(L_decrement_retry);
 271   }
 272   else {
 273     xabort(0);
 274   }
 275   bind(L_on_abort);
 276   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 277     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 278   }
 279   bind(L_decrement_retry);
 280   if (RTMRetryCount > 0) {
 281     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 282     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 283   }
 284 }
 285 
 286 // Use RTM for inflating locks
 287 // inputs: objReg (object to lock)
 288 //         boxReg (on-stack box address (displaced header location) - KILLED)
 289 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 290 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 291                                             Register scrReg, Register retry_on_busy_count_Reg,
 292                                             Register retry_on_abort_count_Reg,
 293                                             RTMLockingCounters* rtm_counters,
 294                                             Metadata* method_data, bool profile_rtm,
 295                                             Label& DONE_LABEL) {
 296   assert(UseRTMLocking, "why call this otherwise?");
 297   assert(tmpReg == rax, "");
 298   assert(scrReg == rdx, "");
 299   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 300   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 301 
 302   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 303   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 304   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 305 
 306   if (RTMRetryCount > 0) {
 307     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 308     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 309     bind(L_rtm_retry);
 310   }
 311   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 312     Label L_noincrement;
 313     if (RTMTotalCountIncrRate > 1) {
 314       // tmpReg, scrReg and flags are killed
 315       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 316     }
 317     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 318     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 319     bind(L_noincrement);
 320   }
 321   xbegin(L_on_abort);
 322   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 323   movptr(tmpReg, Address(tmpReg, owner_offset));
 324   testptr(tmpReg, tmpReg);
 325   jcc(Assembler::zero, DONE_LABEL);
 326   if (UseRTMXendForLockBusy) {
 327     xend();
 328     jmp(L_decrement_retry);
 329   }
 330   else {
 331     xabort(0);
 332   }
 333   bind(L_on_abort);
 334   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 335   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 336     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 337   }
 338   if (RTMRetryCount > 0) {
 339     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 340     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 341   }
 342 
 343   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 344   testptr(tmpReg, tmpReg) ;
 345   jccb(Assembler::notZero, L_decrement_retry) ;
 346 
 347   // Appears unlocked - try to swing _owner from null to non-null.
 348   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 349 #ifdef _LP64
 350   Register threadReg = r15_thread;
 351 #else
 352   get_thread(scrReg);
 353   Register threadReg = scrReg;
 354 #endif
 355   lock();
 356   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 357 
 358   if (RTMRetryCount > 0) {
 359     // success done else retry
 360     jccb(Assembler::equal, DONE_LABEL) ;
 361     bind(L_decrement_retry);
 362     // Spin and retry if lock is busy.
 363     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 364   }
 365   else {
 366     bind(L_decrement_retry);
 367   }
 368 }
 369 
 370 #endif //  INCLUDE_RTM_OPT
 371 
 372 // fast_lock and fast_unlock used by C2
 373 
 374 // Because the transitions from emitted code to the runtime
 375 // monitorenter/exit helper stubs are so slow it's critical that
 376 // we inline both the stack-locking fast path and the inflated fast path.
 377 //
 378 // See also: cmpFastLock and cmpFastUnlock.
 379 //
 380 // What follows is a specialized inline transliteration of the code
 381 // in enter() and exit(). If we're concerned about I$ bloat another
 382 // option would be to emit TrySlowEnter and TrySlowExit methods
 383 // at startup-time.  These methods would accept arguments as
 384 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 385 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 386 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 387 // In practice, however, the # of lock sites is bounded and is usually small.
 388 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 389 // if the processor uses simple bimodal branch predictors keyed by EIP
 390 // Since the helper routines would be called from multiple synchronization
 391 // sites.
 392 //
 393 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 394 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 395 // to those specialized methods.  That'd give us a mostly platform-independent
 396 // implementation that the JITs could optimize and inline at their pleasure.
 397 // Done correctly, the only time we'd need to cross to native could would be
 398 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 399 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 400 // (b) explicit barriers or fence operations.
 401 //
 402 // TODO:
 403 //
 404 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 405 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 406 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 407 //    the lock operators would typically be faster than reifying Self.
 408 //
 409 // *  Ideally I'd define the primitives as:
 410 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 411 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 412 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 413 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 414 //    Furthermore the register assignments are overconstrained, possibly resulting in
 415 //    sub-optimal code near the synchronization site.
 416 //
 417 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 418 //    Alternately, use a better sp-proximity test.
 419 //
 420 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 421 //    Either one is sufficient to uniquely identify a thread.
 422 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 423 //
 424 // *  Intrinsify notify() and notifyAll() for the common cases where the
 425 //    object is locked by the calling thread but the waitlist is empty.
 426 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 427 //
 428 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 429 //    But beware of excessive branch density on AMD Opterons.
 430 //
 431 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 432 //    or failure of the fast path.  If the fast path fails then we pass
 433 //    control to the slow path, typically in C.  In fast_lock and
 434 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 435 //    will emit a conditional branch immediately after the node.
 436 //    So we have branches to branches and lots of ICC.ZF games.
 437 //    Instead, it might be better to have C2 pass a "FailureLabel"
 438 //    into fast_lock and fast_unlock.  In the case of success, control
 439 //    will drop through the node.  ICC.ZF is undefined at exit.
 440 //    In the case of failure, the node will branch directly to the
 441 //    FailureLabel
 442 
 443 
 444 // obj: object to lock
 445 // box: on-stack box address (displaced header location) - KILLED
 446 // rax,: tmp -- KILLED
 447 // scr: tmp -- KILLED
 448 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 449                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 450                                  BiasedLockingCounters* counters,
 451                                  RTMLockingCounters* rtm_counters,
 452                                  RTMLockingCounters* stack_rtm_counters,
 453                                  Metadata* method_data,
 454                                  bool use_rtm, bool profile_rtm) {
 455   // Ensure the register assignments are disjoint
 456   assert(tmpReg == rax, "");
 457 
 458   if (use_rtm) {
 459     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 460   } else {
 461     assert(cx2Reg == noreg, "");
 462     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 463   }
 464 
 465   if (counters != NULL) {
 466     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 467   }
 468 
 469   // Possible cases that we'll encounter in fast_lock
 470   // ------------------------------------------------
 471   // * Inflated
 472   //    -- unlocked
 473   //    -- Locked
 474   //       = by self
 475   //       = by other
 476   // * biased
 477   //    -- by Self
 478   //    -- by other
 479   // * neutral
 480   // * stack-locked
 481   //    -- by self
 482   //       = sp-proximity test hits
 483   //       = sp-proximity test generates false-negative
 484   //    -- by other
 485   //
 486 
 487   Label IsInflated, DONE_LABEL;
 488 
 489   if (DiagnoseSyncOnValueBasedClasses != 0) {
 490     load_klass(tmpReg, objReg, cx1Reg);
 491     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 492     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 493     jcc(Assembler::notZero, DONE_LABEL);
 494   }
 495 
 496   // it's stack-locked, biased or neutral
 497   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 498   // order to reduce the number of conditional branches in the most common cases.
 499   // Beware -- there's a subtle invariant that fetch of the markword
 500   // at [FETCH], below, will never observe a biased encoding (*101b).
 501   // If this invariant is not held we risk exclusion (safety) failure.
 502   if (UseBiasedLocking && !UseOptoBiasInlining) {
 503     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 504   }
 505 
 506 #if INCLUDE_RTM_OPT
 507   if (UseRTMForStackLocks && use_rtm) {
 508     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 509                       stack_rtm_counters, method_data, profile_rtm,
 510                       DONE_LABEL, IsInflated);
 511   }
 512 #endif // INCLUDE_RTM_OPT
 513 
 514   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 515   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 516   jccb(Assembler::notZero, IsInflated);
 517 
 518   // Attempt stack-locking ...
 519   orptr (tmpReg, markWord::unlocked_value);
 520   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 521   lock();
 522   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 523   if (counters != NULL) {
 524     cond_inc32(Assembler::equal,
 525                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 526   }
 527   jcc(Assembler::equal, DONE_LABEL);           // Success
 528 
 529   // Recursive locking.
 530   // The object is stack-locked: markword contains stack pointer to BasicLock.
 531   // Locked by current thread if difference with current SP is less than one page.
 532   subptr(tmpReg, rsp);
 533   // Next instruction set ZFlag == 1 (Success) if difference is less then one page.
 534   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 535   movptr(Address(boxReg, 0), tmpReg);
 536   if (counters != NULL) {
 537     cond_inc32(Assembler::equal,
 538                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 539   }
 540   jmp(DONE_LABEL);
 541 
 542   bind(IsInflated);
 543   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 544 
 545 #if INCLUDE_RTM_OPT
 546   // Use the same RTM locking code in 32- and 64-bit VM.
 547   if (use_rtm) {
 548     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 549                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 550   } else {
 551 #endif // INCLUDE_RTM_OPT
 552 
 553 #ifndef _LP64
 554   // The object is inflated.
 555 
 556   // boxReg refers to the on-stack BasicLock in the current frame.
 557   // We'd like to write:
 558   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 559   // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
 560   // additional latency as we have another ST in the store buffer that must drain.
 561 
 562   // avoid ST-before-CAS
 563   // register juggle because we need tmpReg for cmpxchgptr below
 564   movptr(scrReg, boxReg);
 565   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 566 
 567   // Optimistic form: consider XORL tmpReg,tmpReg
 568   movptr(tmpReg, NULL_WORD);
 569 
 570   // Appears unlocked - try to swing _owner from null to non-null.
 571   // Ideally, I'd manifest "Self" with get_thread and then attempt
 572   // to CAS the register containing Self into m->Owner.
 573   // But we don't have enough registers, so instead we can either try to CAS
 574   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 575   // we later store "Self" into m->Owner.  Transiently storing a stack address
 576   // (rsp or the address of the box) into  m->owner is harmless.
 577   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 578   lock();
 579   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 580   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 581   // If we weren't able to swing _owner from NULL to the BasicLock
 582   // then take the slow path.
 583   jccb  (Assembler::notZero, DONE_LABEL);
 584   // update _owner from BasicLock to thread
 585   get_thread (scrReg);                    // beware: clobbers ICCs
 586   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 587   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 588 
 589   // If the CAS fails we can either retry or pass control to the slow path.
 590   // We use the latter tactic.
 591   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 592   // If the CAS was successful ...
 593   //   Self has acquired the lock
 594   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 595   // Intentional fall-through into DONE_LABEL ...
 596 #else // _LP64
 597   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 598   movq(scrReg, tmpReg);
 599   xorq(tmpReg, tmpReg);
 600   lock();
 601   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 602   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 603   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 604   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 605   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 606   jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)
 607 
 608   cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
 609   jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
 610   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 611   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 612 #endif // _LP64
 613 #if INCLUDE_RTM_OPT
 614   } // use_rtm()
 615 #endif
 616   // DONE_LABEL is a hot target - we'd really like to place it at the
 617   // start of cache line by padding with NOPs.
 618   // See the AMD and Intel software optimization manuals for the
 619   // most efficient "long" NOP encodings.
 620   // Unfortunately none of our alignment mechanisms suffice.
 621   bind(DONE_LABEL);
 622 
 623   // At DONE_LABEL the icc ZFlag is set as follows ...
 624   // fast_unlock uses the same protocol.
 625   // ZFlag == 1 -> Success
 626   // ZFlag == 0 -> Failure - force control through the slow path
 627 }
 628 
 629 // obj: object to unlock
 630 // box: box address (displaced header location), killed.  Must be EAX.
 631 // tmp: killed, cannot be obj nor box.
 632 //
 633 // Some commentary on balanced locking:
 634 //
 635 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 636 // Methods that don't have provably balanced locking are forced to run in the
 637 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 638 // The interpreter provides two properties:
 639 // I1:  At return-time the interpreter automatically and quietly unlocks any
 640 //      objects acquired the current activation (frame).  Recall that the
 641 //      interpreter maintains an on-stack list of locks currently held by
 642 //      a frame.
 643 // I2:  If a method attempts to unlock an object that is not held by the
 644 //      the frame the interpreter throws IMSX.
 645 //
 646 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 647 // B() doesn't have provably balanced locking so it runs in the interpreter.
 648 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 649 // is still locked by A().
 650 //
 651 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 652 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 653 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 654 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 655 // Arguably given that the spec legislates the JNI case as undefined our implementation
 656 // could reasonably *avoid* checking owner in fast_unlock().
 657 // In the interest of performance we elide m->Owner==Self check in unlock.
 658 // A perfectly viable alternative is to elide the owner check except when
 659 // Xcheck:jni is enabled.
 660 
 661 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 662   assert(boxReg == rax, "");
 663   assert_different_registers(objReg, boxReg, tmpReg);
 664 
 665   Label DONE_LABEL, Stacked, CheckSucc;
 666 
 667   // Critically, the biased locking test must have precedence over
 668   // and appear before the (box->dhw == 0) recursive stack-lock test.
 669   if (UseBiasedLocking && !UseOptoBiasInlining) {
 670     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 671   }
 672 
 673 #if INCLUDE_RTM_OPT
 674   if (UseRTMForStackLocks && use_rtm) {
 675     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 676     Label L_regular_unlock;
 677     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 678     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 679     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 680     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 681     xend();                                                           // otherwise end...
 682     jmp(DONE_LABEL);                                                  // ... and we're done
 683     bind(L_regular_unlock);
 684   }
 685 #endif
 686 
 687   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 688   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 689   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 690   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 691   jccb  (Assembler::zero, Stacked);
 692 
 693   // It's inflated.
 694 #if INCLUDE_RTM_OPT
 695   if (use_rtm) {
 696     Label L_regular_inflated_unlock;
 697     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 698     movptr(boxReg, Address(tmpReg, owner_offset));
 699     testptr(boxReg, boxReg);
 700     jccb(Assembler::notZero, L_regular_inflated_unlock);
 701     xend();
 702     jmpb(DONE_LABEL);
 703     bind(L_regular_inflated_unlock);
 704   }
 705 #endif
 706 
 707   // Despite our balanced locking property we still check that m->_owner == Self
 708   // as java routines or native JNI code called by this thread might
 709   // have released the lock.
 710   // Refer to the comments in synchronizer.cpp for how we might encode extra
 711   // state in _succ so we can avoid fetching EntryList|cxq.
 712   //
 713   // If there's no contention try a 1-0 exit.  That is, exit without
 714   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 715   // we detect and recover from the race that the 1-0 exit admits.
 716   //
 717   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 718   // before it STs null into _owner, releasing the lock.  Updates
 719   // to data protected by the critical section must be visible before
 720   // we drop the lock (and thus before any other thread could acquire
 721   // the lock and observe the fields protected by the lock).
 722   // IA32's memory-model is SPO, so STs are ordered with respect to
 723   // each other and there's no need for an explicit barrier (fence).
 724   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 725 #ifndef _LP64
 726   get_thread (boxReg);
 727 
 728   // Note that we could employ various encoding schemes to reduce
 729   // the number of loads below (currently 4) to just 2 or 3.
 730   // Refer to the comments in synchronizer.cpp.
 731   // In practice the chain of fetches doesn't seem to impact performance, however.
 732   xorptr(boxReg, boxReg);
 733   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 734   jccb  (Assembler::notZero, DONE_LABEL);
 735   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 736   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 737   jccb  (Assembler::notZero, CheckSucc);
 738   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 739   jmpb  (DONE_LABEL);
 740 
 741   bind (Stacked);
 742   // It's not inflated and it's not recursively stack-locked and it's not biased.
 743   // It must be stack-locked.
 744   // Try to reset the header to displaced header.
 745   // The "box" value on the stack is stable, so we can reload
 746   // and be assured we observe the same value as above.
 747   movptr(tmpReg, Address(boxReg, 0));
 748   lock();
 749   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 750   // Intention fall-thru into DONE_LABEL
 751 
 752   // DONE_LABEL is a hot target - we'd really like to place it at the
 753   // start of cache line by padding with NOPs.
 754   // See the AMD and Intel software optimization manuals for the
 755   // most efficient "long" NOP encodings.
 756   // Unfortunately none of our alignment mechanisms suffice.
 757   bind (CheckSucc);
 758 #else // _LP64
 759   // It's inflated
 760   Label LNotRecursive, LSuccess, LGoSlowPath;
 761 
 762   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 763   jccb(Assembler::equal, LNotRecursive);
 764 
 765   // Recursive inflated unlock
 766   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 767   jmpb(LSuccess);
 768 
 769   bind(LNotRecursive);
 770   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 771   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 772   jccb  (Assembler::notZero, CheckSucc);
 773   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 774   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 775   jmpb  (DONE_LABEL);
 776 
 777   // Try to avoid passing control into the slow_path ...
 778   bind  (CheckSucc);
 779 
 780   // The following optional optimization can be elided if necessary
 781   // Effectively: if (succ == null) goto slow path
 782   // The code reduces the window for a race, however,
 783   // and thus benefits performance.
 784   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 785   jccb  (Assembler::zero, LGoSlowPath);
 786 
 787   xorptr(boxReg, boxReg);
 788   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 789   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 790 
 791   // Memory barrier/fence
 792   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 793   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 794   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 795   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 796   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 797   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 798   lock(); addl(Address(rsp, 0), 0);
 799 
 800   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 801   jccb  (Assembler::notZero, LSuccess);
 802 
 803   // Rare inopportune interleaving - race.
 804   // The successor vanished in the small window above.
 805   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 806   // We need to ensure progress and succession.
 807   // Try to reacquire the lock.
 808   // If that fails then the new owner is responsible for succession and this
 809   // thread needs to take no further action and can exit via the fast path (success).
 810   // If the re-acquire succeeds then pass control into the slow path.
 811   // As implemented, this latter mode is horrible because we generated more
 812   // coherence traffic on the lock *and* artifically extended the critical section
 813   // length while by virtue of passing control into the slow path.
 814 
 815   // box is really RAX -- the following CMPXCHG depends on that binding
 816   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 817   lock();
 818   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 819   // There's no successor so we tried to regrab the lock.
 820   // If that didn't work, then another thread grabbed the
 821   // lock so we're done (and exit was a success).
 822   jccb  (Assembler::notEqual, LSuccess);
 823   // Intentional fall-through into slow path
 824 
 825   bind  (LGoSlowPath);
 826   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 827   jmpb  (DONE_LABEL);
 828 
 829   bind  (LSuccess);
 830   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 831   jmpb  (DONE_LABEL);
 832 
 833   bind  (Stacked);
 834   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 835   lock();
 836   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 837 
 838 #endif
 839   bind(DONE_LABEL);
 840 }
 841 
 842 //-------------------------------------------------------------------------------------------
 843 // Generic instructions support for use in .ad files C2 code generation
 844 
 845 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 846   if (dst != src) {
 847     movdqu(dst, src);
 848   }
 849   if (opcode == Op_AbsVD) {
 850     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 851   } else {
 852     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 853     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 854   }
 855 }
 856 
 857 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 858   if (opcode == Op_AbsVD) {
 859     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 860   } else {
 861     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 862     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 863   }
 864 }
 865 
 866 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 867   if (dst != src) {
 868     movdqu(dst, src);
 869   }
 870   if (opcode == Op_AbsVF) {
 871     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 872   } else {
 873     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 874     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 875   }
 876 }
 877 
 878 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 879   if (opcode == Op_AbsVF) {
 880     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 881   } else {
 882     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 883     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 884   }
 885 }
 886 
 887 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 888   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 889   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 890 
 891   if (opcode == Op_MinV) {
 892     if (elem_bt == T_BYTE) {
 893       pminsb(dst, src);
 894     } else if (elem_bt == T_SHORT) {
 895       pminsw(dst, src);
 896     } else if (elem_bt == T_INT) {
 897       pminsd(dst, src);
 898     } else {
 899       assert(elem_bt == T_LONG, "required");
 900       assert(tmp == xmm0, "required");
 901       assert_different_registers(dst, src, tmp);
 902       movdqu(xmm0, dst);
 903       pcmpgtq(xmm0, src);
 904       blendvpd(dst, src);  // xmm0 as mask
 905     }
 906   } else { // opcode == Op_MaxV
 907     if (elem_bt == T_BYTE) {
 908       pmaxsb(dst, src);
 909     } else if (elem_bt == T_SHORT) {
 910       pmaxsw(dst, src);
 911     } else if (elem_bt == T_INT) {
 912       pmaxsd(dst, src);
 913     } else {
 914       assert(elem_bt == T_LONG, "required");
 915       assert(tmp == xmm0, "required");
 916       assert_different_registers(dst, src, tmp);
 917       movdqu(xmm0, src);
 918       pcmpgtq(xmm0, dst);
 919       blendvpd(dst, src);  // xmm0 as mask
 920     }
 921   }
 922 }
 923 
 924 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 925                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 926                                  int vlen_enc) {
 927   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 928 
 929   if (opcode == Op_MinV) {
 930     if (elem_bt == T_BYTE) {
 931       vpminsb(dst, src1, src2, vlen_enc);
 932     } else if (elem_bt == T_SHORT) {
 933       vpminsw(dst, src1, src2, vlen_enc);
 934     } else if (elem_bt == T_INT) {
 935       vpminsd(dst, src1, src2, vlen_enc);
 936     } else {
 937       assert(elem_bt == T_LONG, "required");
 938       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 939         vpminsq(dst, src1, src2, vlen_enc);
 940       } else {
 941         assert_different_registers(dst, src1, src2);
 942         vpcmpgtq(dst, src1, src2, vlen_enc);
 943         vblendvpd(dst, src1, src2, dst, vlen_enc);
 944       }
 945     }
 946   } else { // opcode == Op_MaxV
 947     if (elem_bt == T_BYTE) {
 948       vpmaxsb(dst, src1, src2, vlen_enc);
 949     } else if (elem_bt == T_SHORT) {
 950       vpmaxsw(dst, src1, src2, vlen_enc);
 951     } else if (elem_bt == T_INT) {
 952       vpmaxsd(dst, src1, src2, vlen_enc);
 953     } else {
 954       assert(elem_bt == T_LONG, "required");
 955       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 956         vpmaxsq(dst, src1, src2, vlen_enc);
 957       } else {
 958         assert_different_registers(dst, src1, src2);
 959         vpcmpgtq(dst, src1, src2, vlen_enc);
 960         vblendvpd(dst, src2, src1, dst, vlen_enc);
 961       }
 962     }
 963   }
 964 }
 965 
 966 // Float/Double min max
 967 
 968 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 969                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 970                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 971                                    int vlen_enc) {
 972   assert(UseAVX > 0, "required");
 973   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 974          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 975   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 976   assert_different_registers(a, b, tmp, atmp, btmp);
 977 
 978   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 979   bool is_double_word = is_double_word_type(elem_bt);
 980 
 981   if (!is_double_word && is_min) {
 982     vblendvps(atmp, a, b, a, vlen_enc);
 983     vblendvps(btmp, b, a, a, vlen_enc);
 984     vminps(tmp, atmp, btmp, vlen_enc);
 985     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 986     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 987   } else if (!is_double_word && !is_min) {
 988     vblendvps(btmp, b, a, b, vlen_enc);
 989     vblendvps(atmp, a, b, b, vlen_enc);
 990     vmaxps(tmp, atmp, btmp, vlen_enc);
 991     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 992     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 993   } else if (is_double_word && is_min) {
 994     vblendvpd(atmp, a, b, a, vlen_enc);
 995     vblendvpd(btmp, b, a, a, vlen_enc);
 996     vminpd(tmp, atmp, btmp, vlen_enc);
 997     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 998     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 999   } else {
1000     assert(is_double_word && !is_min, "sanity");
1001     vblendvpd(btmp, b, a, b, vlen_enc);
1002     vblendvpd(atmp, a, b, b, vlen_enc);
1003     vmaxpd(tmp, atmp, btmp, vlen_enc);
1004     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1005     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1006   }
1007 }
1008 
1009 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1010                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1011                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1012                                     int vlen_enc) {
1013   assert(UseAVX > 2, "required");
1014   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1015          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1016   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1017   assert_different_registers(dst, a, b, atmp, btmp);
1018 
1019   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1020   bool is_double_word = is_double_word_type(elem_bt);
1021   bool merge = true;
1022 
1023   if (!is_double_word && is_min) {
1024     evpmovd2m(ktmp, a, vlen_enc);
1025     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1026     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1027     vminps(dst, atmp, btmp, vlen_enc);
1028     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1030   } else if (!is_double_word && !is_min) {
1031     evpmovd2m(ktmp, b, vlen_enc);
1032     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1033     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1034     vmaxps(dst, atmp, btmp, vlen_enc);
1035     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1036     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1037   } else if (is_double_word && is_min) {
1038     evpmovq2m(ktmp, a, vlen_enc);
1039     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1040     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1041     vminpd(dst, atmp, btmp, vlen_enc);
1042     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1043     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1044   } else {
1045     assert(is_double_word && !is_min, "sanity");
1046     evpmovq2m(ktmp, b, vlen_enc);
1047     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1048     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1049     vmaxpd(dst, atmp, btmp, vlen_enc);
1050     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1051     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1052   }
1053 }
1054 
1055 // Float/Double signum
1056 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1057                                   XMMRegister zero, XMMRegister one,
1058                                   Register scratch) {
1059   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1060 
1061   Label DONE_LABEL;
1062 
1063   if (opcode == Op_SignumF) {
1064     assert(UseSSE > 0, "required");
1065     ucomiss(dst, zero);
1066     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1067     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1068     movflt(dst, one);
1069     jcc(Assembler::above, DONE_LABEL);
1070     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1071   } else if (opcode == Op_SignumD) {
1072     assert(UseSSE > 1, "required");
1073     ucomisd(dst, zero);
1074     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1075     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1076     movdbl(dst, one);
1077     jcc(Assembler::above, DONE_LABEL);
1078     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1079   }
1080 
1081   bind(DONE_LABEL);
1082 }
1083 
1084 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1085   if (sign) {
1086     pmovsxbw(dst, src);
1087   } else {
1088     pmovzxbw(dst, src);
1089   }
1090 }
1091 
1092 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1093   if (sign) {
1094     vpmovsxbw(dst, src, vector_len);
1095   } else {
1096     vpmovzxbw(dst, src, vector_len);
1097   }
1098 }
1099 
1100 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1101   if (sign) {
1102     vpmovsxbd(dst, src, vector_len);
1103   } else {
1104     vpmovzxbd(dst, src, vector_len);
1105   }
1106 }
1107 
1108 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1109   if (sign) {
1110     vpmovsxwd(dst, src, vector_len);
1111   } else {
1112     vpmovzxwd(dst, src, vector_len);
1113   }
1114 }
1115 
1116 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1117                                      int shift, int vector_len) {
1118   if (opcode == Op_RotateLeftV) {
1119     if (etype == T_INT) {
1120       evprold(dst, src, shift, vector_len);
1121     } else {
1122       assert(etype == T_LONG, "expected type T_LONG");
1123       evprolq(dst, src, shift, vector_len);
1124     }
1125   } else {
1126     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1127     if (etype == T_INT) {
1128       evprord(dst, src, shift, vector_len);
1129     } else {
1130       assert(etype == T_LONG, "expected type T_LONG");
1131       evprorq(dst, src, shift, vector_len);
1132     }
1133   }
1134 }
1135 
1136 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1137                                      XMMRegister shift, int vector_len) {
1138   if (opcode == Op_RotateLeftV) {
1139     if (etype == T_INT) {
1140       evprolvd(dst, src, shift, vector_len);
1141     } else {
1142       assert(etype == T_LONG, "expected type T_LONG");
1143       evprolvq(dst, src, shift, vector_len);
1144     }
1145   } else {
1146     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1147     if (etype == T_INT) {
1148       evprorvd(dst, src, shift, vector_len);
1149     } else {
1150       assert(etype == T_LONG, "expected type T_LONG");
1151       evprorvq(dst, src, shift, vector_len);
1152     }
1153   }
1154 }
1155 
1156 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1157   if (opcode == Op_RShiftVI) {
1158     psrad(dst, shift);
1159   } else if (opcode == Op_LShiftVI) {
1160     pslld(dst, shift);
1161   } else {
1162     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1163     psrld(dst, shift);
1164   }
1165 }
1166 
1167 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1168   switch (opcode) {
1169     case Op_RShiftVI:  psrad(dst, shift); break;
1170     case Op_LShiftVI:  pslld(dst, shift); break;
1171     case Op_URShiftVI: psrld(dst, shift); break;
1172 
1173     default: assert(false, "%s", NodeClassNames[opcode]);
1174   }
1175 }
1176 
1177 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1178   if (opcode == Op_RShiftVI) {
1179     vpsrad(dst, nds, shift, vector_len);
1180   } else if (opcode == Op_LShiftVI) {
1181     vpslld(dst, nds, shift, vector_len);
1182   } else {
1183     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1184     vpsrld(dst, nds, shift, vector_len);
1185   }
1186 }
1187 
1188 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1189   switch (opcode) {
1190     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1191     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1192     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1193 
1194     default: assert(false, "%s", NodeClassNames[opcode]);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1199   switch (opcode) {
1200     case Op_RShiftVB:  // fall-through
1201     case Op_RShiftVS:  psraw(dst, shift); break;
1202 
1203     case Op_LShiftVB:  // fall-through
1204     case Op_LShiftVS:  psllw(dst, shift);   break;
1205 
1206     case Op_URShiftVS: // fall-through
1207     case Op_URShiftVB: psrlw(dst, shift);  break;
1208 
1209     default: assert(false, "%s", NodeClassNames[opcode]);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1214   switch (opcode) {
1215     case Op_RShiftVB:  // fall-through
1216     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1217 
1218     case Op_LShiftVB:  // fall-through
1219     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1220 
1221     case Op_URShiftVS: // fall-through
1222     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1223 
1224     default: assert(false, "%s", NodeClassNames[opcode]);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1229   switch (opcode) {
1230     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1231     case Op_LShiftVL:  psllq(dst, shift); break;
1232     case Op_URShiftVL: psrlq(dst, shift); break;
1233 
1234     default: assert(false, "%s", NodeClassNames[opcode]);
1235   }
1236 }
1237 
1238 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1239   if (opcode == Op_RShiftVL) {
1240     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1241   } else if (opcode == Op_LShiftVL) {
1242     psllq(dst, shift);
1243   } else {
1244     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1245     psrlq(dst, shift);
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1250   switch (opcode) {
1251     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1252     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1253     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1254 
1255     default: assert(false, "%s", NodeClassNames[opcode]);
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1260   if (opcode == Op_RShiftVL) {
1261     evpsraq(dst, nds, shift, vector_len);
1262   } else if (opcode == Op_LShiftVL) {
1263     vpsllq(dst, nds, shift, vector_len);
1264   } else {
1265     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1266     vpsrlq(dst, nds, shift, vector_len);
1267   }
1268 }
1269 
1270 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1271   switch (opcode) {
1272     case Op_RShiftVB:  // fall-through
1273     case Op_RShiftVS:  // fall-through
1274     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1275 
1276     case Op_LShiftVB:  // fall-through
1277     case Op_LShiftVS:  // fall-through
1278     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1279 
1280     case Op_URShiftVB: // fall-through
1281     case Op_URShiftVS: // fall-through
1282     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1283 
1284     default: assert(false, "%s", NodeClassNames[opcode]);
1285   }
1286 }
1287 
1288 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1289   switch (opcode) {
1290     case Op_RShiftVB:  // fall-through
1291     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1292 
1293     case Op_LShiftVB:  // fall-through
1294     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1295 
1296     case Op_URShiftVB: // fall-through
1297     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1298 
1299     default: assert(false, "%s", NodeClassNames[opcode]);
1300   }
1301 }
1302 
1303 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1304   assert(UseAVX >= 2, "required");
1305   switch (opcode) {
1306     case Op_RShiftVL: {
1307       if (UseAVX > 2) {
1308         assert(tmp == xnoreg, "not used");
1309         if (!VM_Version::supports_avx512vl()) {
1310           vlen_enc = Assembler::AVX_512bit;
1311         }
1312         evpsravq(dst, src, shift, vlen_enc);
1313       } else {
1314         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1315         vpsrlvq(dst, src, shift, vlen_enc);
1316         vpsrlvq(tmp, tmp, shift, vlen_enc);
1317         vpxor(dst, dst, tmp, vlen_enc);
1318         vpsubq(dst, dst, tmp, vlen_enc);
1319       }
1320       break;
1321     }
1322     case Op_LShiftVL: {
1323       assert(tmp == xnoreg, "not used");
1324       vpsllvq(dst, src, shift, vlen_enc);
1325       break;
1326     }
1327     case Op_URShiftVL: {
1328       assert(tmp == xnoreg, "not used");
1329       vpsrlvq(dst, src, shift, vlen_enc);
1330       break;
1331     }
1332     default: assert(false, "%s", NodeClassNames[opcode]);
1333   }
1334 }
1335 
1336 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1337 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1338   assert(opcode == Op_LShiftVB ||
1339          opcode == Op_RShiftVB ||
1340          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1341   bool sign = (opcode != Op_URShiftVB);
1342   assert(vector_len == 0, "required");
1343   vextendbd(sign, dst, src, 1);
1344   vpmovzxbd(vtmp, shift, 1);
1345   varshiftd(opcode, dst, dst, vtmp, 1);
1346   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1347   vextracti128_high(vtmp, dst);
1348   vpackusdw(dst, dst, vtmp, 0);
1349 }
1350 
1351 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1352 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1353   assert(opcode == Op_LShiftVB ||
1354          opcode == Op_RShiftVB ||
1355          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1356   bool sign = (opcode != Op_URShiftVB);
1357   int ext_vector_len = vector_len + 1;
1358   vextendbw(sign, dst, src, ext_vector_len);
1359   vpmovzxbw(vtmp, shift, ext_vector_len);
1360   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1361   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1362   if (vector_len == 0) {
1363     vextracti128_high(vtmp, dst);
1364     vpackuswb(dst, dst, vtmp, vector_len);
1365   } else {
1366     vextracti64x4_high(vtmp, dst);
1367     vpackuswb(dst, dst, vtmp, vector_len);
1368     vpermq(dst, dst, 0xD8, vector_len);
1369   }
1370 }
1371 
1372 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1373   switch(typ) {
1374     case T_BYTE:
1375       pinsrb(dst, val, idx);
1376       break;
1377     case T_SHORT:
1378       pinsrw(dst, val, idx);
1379       break;
1380     case T_INT:
1381       pinsrd(dst, val, idx);
1382       break;
1383     case T_LONG:
1384       pinsrq(dst, val, idx);
1385       break;
1386     default:
1387       assert(false,"Should not reach here.");
1388       break;
1389   }
1390 }
1391 
1392 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1393   switch(typ) {
1394     case T_BYTE:
1395       vpinsrb(dst, src, val, idx);
1396       break;
1397     case T_SHORT:
1398       vpinsrw(dst, src, val, idx);
1399       break;
1400     case T_INT:
1401       vpinsrd(dst, src, val, idx);
1402       break;
1403     case T_LONG:
1404       vpinsrq(dst, src, val, idx);
1405       break;
1406     default:
1407       assert(false,"Should not reach here.");
1408       break;
1409   }
1410 }
1411 
1412 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1413   switch(typ) {
1414     case T_INT:
1415       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1416       break;
1417     case T_FLOAT:
1418       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1419       break;
1420     case T_LONG:
1421       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1422       break;
1423     case T_DOUBLE:
1424       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1425       break;
1426     default:
1427       assert(false,"Should not reach here.");
1428       break;
1429   }
1430 }
1431 
1432 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1433   switch(typ) {
1434     case T_INT:
1435       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1436       break;
1437     case T_FLOAT:
1438       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1439       break;
1440     case T_LONG:
1441       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1442       break;
1443     case T_DOUBLE:
1444       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1445       break;
1446     default:
1447       assert(false,"Should not reach here.");
1448       break;
1449   }
1450 }
1451 
1452 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1453   switch(typ) {
1454     case T_INT:
1455       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1456       break;
1457     case T_FLOAT:
1458       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1459       break;
1460     case T_LONG:
1461       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1462       break;
1463     case T_DOUBLE:
1464       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1465       break;
1466     default:
1467       assert(false,"Should not reach here.");
1468       break;
1469   }
1470 }
1471 
1472 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1473   if (vlen_in_bytes <= 16) {
1474     pxor (dst, dst);
1475     psubb(dst, src);
1476     switch (elem_bt) {
1477       case T_BYTE:   /* nothing to do */ break;
1478       case T_SHORT:  pmovsxbw(dst, dst); break;
1479       case T_INT:    pmovsxbd(dst, dst); break;
1480       case T_FLOAT:  pmovsxbd(dst, dst); break;
1481       case T_LONG:   pmovsxbq(dst, dst); break;
1482       case T_DOUBLE: pmovsxbq(dst, dst); break;
1483 
1484       default: assert(false, "%s", type2name(elem_bt));
1485     }
1486   } else {
1487     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1488     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1489 
1490     vpxor (dst, dst, dst, vlen_enc);
1491     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1492 
1493     switch (elem_bt) {
1494       case T_BYTE:   /* nothing to do */            break;
1495       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1496       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1497       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1498       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1499       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1500 
1501       default: assert(false, "%s", type2name(elem_bt));
1502     }
1503   }
1504 }
1505 
1506 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1507   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1508   if (vlen_in_bytes == 4) {
1509     movdl(dst, addr);
1510   } else if (vlen_in_bytes == 8) {
1511     movq(dst, addr);
1512   } else if (vlen_in_bytes == 16) {
1513     movdqu(dst, addr, scratch);
1514   } else if (vlen_in_bytes == 32) {
1515     vmovdqu(dst, addr, scratch);
1516   } else {
1517     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1518     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1519   }
1520 }
1521 
1522 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1523 
1524 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1525   int vector_len = Assembler::AVX_128bit;
1526 
1527   switch (opcode) {
1528     case Op_AndReductionV:  pand(dst, src); break;
1529     case Op_OrReductionV:   por (dst, src); break;
1530     case Op_XorReductionV:  pxor(dst, src); break;
1531     case Op_MinReductionV:
1532       switch (typ) {
1533         case T_BYTE:        pminsb(dst, src); break;
1534         case T_SHORT:       pminsw(dst, src); break;
1535         case T_INT:         pminsd(dst, src); break;
1536         case T_LONG:        assert(UseAVX > 2, "required");
1537                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1538         default:            assert(false, "wrong type");
1539       }
1540       break;
1541     case Op_MaxReductionV:
1542       switch (typ) {
1543         case T_BYTE:        pmaxsb(dst, src); break;
1544         case T_SHORT:       pmaxsw(dst, src); break;
1545         case T_INT:         pmaxsd(dst, src); break;
1546         case T_LONG:        assert(UseAVX > 2, "required");
1547                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1548         default:            assert(false, "wrong type");
1549       }
1550       break;
1551     case Op_AddReductionVF: addss(dst, src); break;
1552     case Op_AddReductionVD: addsd(dst, src); break;
1553     case Op_AddReductionVI:
1554       switch (typ) {
1555         case T_BYTE:        paddb(dst, src); break;
1556         case T_SHORT:       paddw(dst, src); break;
1557         case T_INT:         paddd(dst, src); break;
1558         default:            assert(false, "wrong type");
1559       }
1560       break;
1561     case Op_AddReductionVL: paddq(dst, src); break;
1562     case Op_MulReductionVF: mulss(dst, src); break;
1563     case Op_MulReductionVD: mulsd(dst, src); break;
1564     case Op_MulReductionVI:
1565       switch (typ) {
1566         case T_SHORT:       pmullw(dst, src); break;
1567         case T_INT:         pmulld(dst, src); break;
1568         default:            assert(false, "wrong type");
1569       }
1570       break;
1571     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1572                             vpmullq(dst, dst, src, vector_len); break;
1573     default:                assert(false, "wrong opcode");
1574   }
1575 }
1576 
1577 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1578   int vector_len = Assembler::AVX_256bit;
1579 
1580   switch (opcode) {
1581     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1582     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1583     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1584     case Op_MinReductionV:
1585       switch (typ) {
1586         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1587         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1588         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1589         case T_LONG:        assert(UseAVX > 2, "required");
1590                             vpminsq(dst, src1, src2, vector_len); break;
1591         default:            assert(false, "wrong type");
1592       }
1593       break;
1594     case Op_MaxReductionV:
1595       switch (typ) {
1596         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1597         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1598         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1599         case T_LONG:        assert(UseAVX > 2, "required");
1600                             vpmaxsq(dst, src1, src2, vector_len); break;
1601         default:            assert(false, "wrong type");
1602       }
1603       break;
1604     case Op_AddReductionVI:
1605       switch (typ) {
1606         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1607         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1608         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1609         default:            assert(false, "wrong type");
1610       }
1611       break;
1612     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1613     case Op_MulReductionVI:
1614       switch (typ) {
1615         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1616         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1617         default:            assert(false, "wrong type");
1618       }
1619       break;
1620     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1621     default:                assert(false, "wrong opcode");
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1626                                   XMMRegister dst, XMMRegister src,
1627                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1628   switch (opcode) {
1629     case Op_AddReductionVF:
1630     case Op_MulReductionVF:
1631       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1632       break;
1633 
1634     case Op_AddReductionVD:
1635     case Op_MulReductionVD:
1636       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1637       break;
1638 
1639     default: assert(false, "wrong opcode");
1640   }
1641 }
1642 
1643 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1644                              Register dst, Register src1, XMMRegister src2,
1645                              XMMRegister vtmp1, XMMRegister vtmp2) {
1646   switch (vlen) {
1647     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1648     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1649     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1650     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1651 
1652     default: assert(false, "wrong vector length");
1653   }
1654 }
1655 
1656 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1657                              Register dst, Register src1, XMMRegister src2,
1658                              XMMRegister vtmp1, XMMRegister vtmp2) {
1659   switch (vlen) {
1660     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1661     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1662     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1663     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1664 
1665     default: assert(false, "wrong vector length");
1666   }
1667 }
1668 
1669 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1670                              Register dst, Register src1, XMMRegister src2,
1671                              XMMRegister vtmp1, XMMRegister vtmp2) {
1672   switch (vlen) {
1673     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1674     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1675     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1676     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1677 
1678     default: assert(false, "wrong vector length");
1679   }
1680 }
1681 
1682 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1683                              Register dst, Register src1, XMMRegister src2,
1684                              XMMRegister vtmp1, XMMRegister vtmp2) {
1685   switch (vlen) {
1686     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1687     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1688     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1689     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1690 
1691     default: assert(false, "wrong vector length");
1692   }
1693 }
1694 
1695 #ifdef _LP64
1696 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1697                              Register dst, Register src1, XMMRegister src2,
1698                              XMMRegister vtmp1, XMMRegister vtmp2) {
1699   switch (vlen) {
1700     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1701     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1702     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1703 
1704     default: assert(false, "wrong vector length");
1705   }
1706 }
1707 #endif // _LP64
1708 
1709 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1710   switch (vlen) {
1711     case 2:
1712       assert(vtmp2 == xnoreg, "");
1713       reduce2F(opcode, dst, src, vtmp1);
1714       break;
1715     case 4:
1716       assert(vtmp2 == xnoreg, "");
1717       reduce4F(opcode, dst, src, vtmp1);
1718       break;
1719     case 8:
1720       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1721       break;
1722     case 16:
1723       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1724       break;
1725     default: assert(false, "wrong vector length");
1726   }
1727 }
1728 
1729 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1730   switch (vlen) {
1731     case 2:
1732       assert(vtmp2 == xnoreg, "");
1733       reduce2D(opcode, dst, src, vtmp1);
1734       break;
1735     case 4:
1736       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1737       break;
1738     case 8:
1739       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1740       break;
1741     default: assert(false, "wrong vector length");
1742   }
1743 }
1744 
1745 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1746   if (opcode == Op_AddReductionVI) {
1747     if (vtmp1 != src2) {
1748       movdqu(vtmp1, src2);
1749     }
1750     phaddd(vtmp1, vtmp1);
1751   } else {
1752     pshufd(vtmp1, src2, 0x1);
1753     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1754   }
1755   movdl(vtmp2, src1);
1756   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1757   movdl(dst, vtmp1);
1758 }
1759 
1760 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1761   if (opcode == Op_AddReductionVI) {
1762     if (vtmp1 != src2) {
1763       movdqu(vtmp1, src2);
1764     }
1765     phaddd(vtmp1, src2);
1766     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1767   } else {
1768     pshufd(vtmp2, src2, 0xE);
1769     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1770     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1771   }
1772 }
1773 
1774 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1775   if (opcode == Op_AddReductionVI) {
1776     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1777     vextracti128_high(vtmp2, vtmp1);
1778     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1779     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1780   } else {
1781     vextracti128_high(vtmp1, src2);
1782     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1783     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1788   vextracti64x4_high(vtmp2, src2);
1789   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1790   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1791 }
1792 
1793 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1794   pshufd(vtmp2, src2, 0x1);
1795   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1796   movdqu(vtmp1, vtmp2);
1797   psrldq(vtmp1, 2);
1798   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1799   movdqu(vtmp2, vtmp1);
1800   psrldq(vtmp2, 1);
1801   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1802   movdl(vtmp2, src1);
1803   pmovsxbd(vtmp1, vtmp1);
1804   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1805   pextrb(dst, vtmp1, 0x0);
1806   movsbl(dst, dst);
1807 }
1808 
1809 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1810   pshufd(vtmp1, src2, 0xE);
1811   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1812   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1813 }
1814 
1815 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1816   vextracti128_high(vtmp2, src2);
1817   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1818   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1819 }
1820 
1821 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1822   vextracti64x4_high(vtmp1, src2);
1823   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1824   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1825 }
1826 
1827 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1828   pmovsxbw(vtmp2, src2);
1829   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1830 }
1831 
1832 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1833   if (UseAVX > 1) {
1834     int vector_len = Assembler::AVX_256bit;
1835     vpmovsxbw(vtmp1, src2, vector_len);
1836     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1837   } else {
1838     pmovsxbw(vtmp2, src2);
1839     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1840     pshufd(vtmp2, src2, 0x1);
1841     pmovsxbw(vtmp2, src2);
1842     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1843   }
1844 }
1845 
1846 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1847   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1848     int vector_len = Assembler::AVX_512bit;
1849     vpmovsxbw(vtmp1, src2, vector_len);
1850     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1851   } else {
1852     assert(UseAVX >= 2,"Should not reach here.");
1853     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1854     vextracti128_high(vtmp2, src2);
1855     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1856   }
1857 }
1858 
1859 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1860   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1861   vextracti64x4_high(vtmp2, src2);
1862   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1863 }
1864 
1865 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1866   if (opcode == Op_AddReductionVI) {
1867     if (vtmp1 != src2) {
1868       movdqu(vtmp1, src2);
1869     }
1870     phaddw(vtmp1, vtmp1);
1871     phaddw(vtmp1, vtmp1);
1872   } else {
1873     pshufd(vtmp2, src2, 0x1);
1874     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1875     movdqu(vtmp1, vtmp2);
1876     psrldq(vtmp1, 2);
1877     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1878   }
1879   movdl(vtmp2, src1);
1880   pmovsxwd(vtmp1, vtmp1);
1881   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1882   pextrw(dst, vtmp1, 0x0);
1883   movswl(dst, dst);
1884 }
1885 
1886 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1887   if (opcode == Op_AddReductionVI) {
1888     if (vtmp1 != src2) {
1889       movdqu(vtmp1, src2);
1890     }
1891     phaddw(vtmp1, src2);
1892   } else {
1893     pshufd(vtmp1, src2, 0xE);
1894     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1895   }
1896   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1897 }
1898 
1899 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1900   if (opcode == Op_AddReductionVI) {
1901     int vector_len = Assembler::AVX_256bit;
1902     vphaddw(vtmp2, src2, src2, vector_len);
1903     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1904   } else {
1905     vextracti128_high(vtmp2, src2);
1906     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1907   }
1908   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1909 }
1910 
1911 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1912   int vector_len = Assembler::AVX_256bit;
1913   vextracti64x4_high(vtmp1, src2);
1914   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1915   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1916 }
1917 
1918 #ifdef _LP64
1919 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1920   pshufd(vtmp2, src2, 0xE);
1921   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1922   movdq(vtmp1, src1);
1923   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1924   movdq(dst, vtmp1);
1925 }
1926 
1927 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1928   vextracti128_high(vtmp1, src2);
1929   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1930   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1931 }
1932 
1933 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1934   vextracti64x4_high(vtmp2, src2);
1935   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1936   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1937 }
1938 
1939 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1940   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1941   mov64(temp, -1L);
1942   bzhiq(temp, temp, len);
1943   kmovql(dst, temp);
1944 }
1945 #endif // _LP64
1946 
1947 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1948   reduce_operation_128(T_FLOAT, opcode, dst, src);
1949   pshufd(vtmp, src, 0x1);
1950   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1951 }
1952 
1953 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1954   reduce2F(opcode, dst, src, vtmp);
1955   pshufd(vtmp, src, 0x2);
1956   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1957   pshufd(vtmp, src, 0x3);
1958   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1959 }
1960 
1961 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1962   reduce4F(opcode, dst, src, vtmp2);
1963   vextractf128_high(vtmp2, src);
1964   reduce4F(opcode, dst, vtmp2, vtmp1);
1965 }
1966 
1967 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1968   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1969   vextracti64x4_high(vtmp1, src);
1970   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1971 }
1972 
1973 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1974   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1975   pshufd(vtmp, src, 0xE);
1976   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1977 }
1978 
1979 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980   reduce2D(opcode, dst, src, vtmp2);
1981   vextractf128_high(vtmp2, src);
1982   reduce2D(opcode, dst, vtmp2, vtmp1);
1983 }
1984 
1985 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1986   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1987   vextracti64x4_high(vtmp1, src);
1988   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1989 }
1990 
1991 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1992   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1993 }
1994 
1995 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1996   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1997 }
1998 
1999 
2000 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2001                                           XMMRegister dst, XMMRegister src,
2002                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2003                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2004   int permconst[] = {1, 14};
2005   XMMRegister wsrc = src;
2006   XMMRegister wdst = xmm_0;
2007   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2008 
2009   int vlen_enc = Assembler::AVX_128bit;
2010   if (vlen == 16) {
2011     vlen_enc = Assembler::AVX_256bit;
2012   }
2013 
2014   for (int i = log2(vlen) - 1; i >=0; i--) {
2015     if (i == 0 && !is_dst_valid) {
2016       wdst = dst;
2017     }
2018     if (i == 3) {
2019       vextracti64x4_high(wtmp, wsrc);
2020     } else if (i == 2) {
2021       vextracti128_high(wtmp, wsrc);
2022     } else { // i = [0,1]
2023       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2024     }
2025     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2026     wsrc = wdst;
2027     vlen_enc = Assembler::AVX_128bit;
2028   }
2029   if (is_dst_valid) {
2030     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2031   }
2032 }
2033 
2034 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2035                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2036                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2037   XMMRegister wsrc = src;
2038   XMMRegister wdst = xmm_0;
2039   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2040   int vlen_enc = Assembler::AVX_128bit;
2041   if (vlen == 8) {
2042     vlen_enc = Assembler::AVX_256bit;
2043   }
2044   for (int i = log2(vlen) - 1; i >=0; i--) {
2045     if (i == 0 && !is_dst_valid) {
2046       wdst = dst;
2047     }
2048     if (i == 1) {
2049       vextracti128_high(wtmp, wsrc);
2050     } else if (i == 2) {
2051       vextracti64x4_high(wtmp, wsrc);
2052     } else {
2053       assert(i == 0, "%d", i);
2054       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2055     }
2056     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2057     wsrc = wdst;
2058     vlen_enc = Assembler::AVX_128bit;
2059   }
2060   if (is_dst_valid) {
2061     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2062   }
2063 }
2064 
2065 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2066   switch (bt) {
2067     case T_BYTE:  pextrb(dst, src, idx); break;
2068     case T_SHORT: pextrw(dst, src, idx); break;
2069     case T_INT:   pextrd(dst, src, idx); break;
2070     case T_LONG:  pextrq(dst, src, idx); break;
2071 
2072     default:
2073       assert(false,"Should not reach here.");
2074       break;
2075   }
2076 }
2077 
2078 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2079   int esize =  type2aelembytes(typ);
2080   int elem_per_lane = 16/esize;
2081   int lane = elemindex / elem_per_lane;
2082   int eindex = elemindex % elem_per_lane;
2083 
2084   if (lane >= 2) {
2085     assert(UseAVX > 2, "required");
2086     vextractf32x4(dst, src, lane & 3);
2087     return dst;
2088   } else if (lane > 0) {
2089     assert(UseAVX > 0, "required");
2090     vextractf128(dst, src, lane);
2091     return dst;
2092   } else {
2093     return src;
2094   }
2095 }
2096 
2097 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2098   if (typ == T_BYTE) {
2099     movsbl(dst, dst);
2100   } else if (typ == T_SHORT) {
2101     movswl(dst, dst);
2102   }
2103 }
2104 
2105 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2106   int esize =  type2aelembytes(typ);
2107   int elem_per_lane = 16/esize;
2108   int eindex = elemindex % elem_per_lane;
2109   assert(is_integral_type(typ),"required");
2110 
2111   if (eindex == 0) {
2112     if (typ == T_LONG) {
2113       movq(dst, src);
2114     } else {
2115       movdl(dst, src);
2116       movsxl(typ, dst);
2117     }
2118   } else {
2119     extract(typ, dst, src, eindex);
2120     movsxl(typ, dst);
2121   }
2122 }
2123 
2124 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2125   int esize =  type2aelembytes(typ);
2126   int elem_per_lane = 16/esize;
2127   int eindex = elemindex % elem_per_lane;
2128   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2129 
2130   if (eindex == 0) {
2131     movq(dst, src);
2132   } else {
2133     if (typ == T_FLOAT) {
2134       if (UseAVX == 0) {
2135         movdqu(dst, src);
2136         pshufps(dst, dst, eindex);
2137       } else {
2138         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2139       }
2140     } else {
2141       if (UseAVX == 0) {
2142         movdqu(dst, src);
2143         psrldq(dst, eindex*esize);
2144       } else {
2145         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2146       }
2147       movq(dst, dst);
2148     }
2149   }
2150   // Zero upper bits
2151   if (typ == T_FLOAT) {
2152     if (UseAVX == 0) {
2153       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2154       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2155       pand(dst, vtmp);
2156     } else {
2157       assert((tmp != noreg), "required.");
2158       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2159     }
2160   }
2161 }
2162 
2163 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2164   switch(typ) {
2165     case T_BYTE:
2166     case T_BOOLEAN:
2167       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2168       break;
2169     case T_SHORT:
2170     case T_CHAR:
2171       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2172       break;
2173     case T_INT:
2174     case T_FLOAT:
2175       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2176       break;
2177     case T_LONG:
2178     case T_DOUBLE:
2179       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2180       break;
2181     default:
2182       assert(false,"Should not reach here.");
2183       break;
2184   }
2185 }
2186 
2187 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2188   switch(typ) {
2189     case T_BOOLEAN:
2190     case T_BYTE:
2191       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2192       break;
2193     case T_CHAR:
2194     case T_SHORT:
2195       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2196       break;
2197     case T_INT:
2198     case T_FLOAT:
2199       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2200       break;
2201     case T_LONG:
2202     case T_DOUBLE:
2203       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2204       break;
2205     default:
2206       assert(false,"Should not reach here.");
2207       break;
2208   }
2209 }
2210 
2211 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2212                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2213   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2214   switch (typ) {
2215   case T_BYTE:
2216     vpmovzxbw(vtmp1, src1, vlen_enc);
2217     vpmovzxbw(vtmp2, src2, vlen_enc);
2218     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2219     vpacksswb(dst, dst, dst, vlen_enc);
2220     break;
2221   case T_SHORT:
2222     vpmovzxwd(vtmp1, src1, vlen_enc);
2223     vpmovzxwd(vtmp2, src2, vlen_enc);
2224     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2225     vpackssdw(dst, dst, dst, vlen_enc);
2226     break;
2227   case T_INT:
2228     vpmovzxdq(vtmp1, src1, vlen_enc);
2229     vpmovzxdq(vtmp2, src2, vlen_enc);
2230     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2231     vpermilps(dst, dst, 8, vlen_enc);
2232     break;
2233   default:
2234     assert(false, "Should not reach here");
2235   }
2236   if (vlen_in_bytes == 16) {
2237     vpermpd(dst, dst, 0x8, vlen_enc);
2238   }
2239 }
2240 
2241 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2242                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2243   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2244   switch (typ) {
2245   case T_BYTE:
2246     vpmovzxbw(vtmp1, src1, vlen_enc);
2247     vpmovzxbw(vtmp2, src2, vlen_enc);
2248     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2249     vextracti128(vtmp1, src1, 1);
2250     vextracti128(vtmp2, src2, 1);
2251     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2252     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2253     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2254     vpacksswb(dst, dst, vtmp3, vlen_enc);
2255     vpermpd(dst, dst, 0xd8, vlen_enc);
2256     break;
2257   case T_SHORT:
2258     vpmovzxwd(vtmp1, src1, vlen_enc);
2259     vpmovzxwd(vtmp2, src2, vlen_enc);
2260     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2261     vextracti128(vtmp1, src1, 1);
2262     vextracti128(vtmp2, src2, 1);
2263     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2264     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2265     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2266     vpackssdw(dst, dst, vtmp3, vlen_enc);
2267     vpermpd(dst, dst, 0xd8, vlen_enc);
2268     break;
2269   case T_INT:
2270     vpmovzxdq(vtmp1, src1, vlen_enc);
2271     vpmovzxdq(vtmp2, src2, vlen_enc);
2272     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2273     vpshufd(dst, dst, 8, vlen_enc);
2274     vpermq(dst, dst, 8, vlen_enc);
2275     vextracti128(vtmp1, src1, 1);
2276     vextracti128(vtmp2, src2, 1);
2277     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2278     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2279     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2280     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2281     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2282     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2283     break;
2284   default:
2285     assert(false, "Should not reach here");
2286   }
2287 }
2288 
2289 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2290   switch(typ) {
2291     case T_BYTE:
2292       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2293       break;
2294     case T_SHORT:
2295       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2296       break;
2297     case T_INT:
2298     case T_FLOAT:
2299       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2300       break;
2301     case T_LONG:
2302     case T_DOUBLE:
2303       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2304       break;
2305     default:
2306       assert(false,"Should not reach here.");
2307       break;
2308   }
2309 }
2310 
2311 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2312                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2313   switch(vlen) {
2314     case 4:
2315       assert(vtmp1 != xnoreg, "required.");
2316       // Broadcast lower 32 bits to 128 bits before ptest
2317       pshufd(vtmp1, src1, 0x0);
2318       if (bt == BoolTest::overflow) {
2319         assert(vtmp2 != xnoreg, "required.");
2320         pshufd(vtmp2, src2, 0x0);
2321       } else {
2322         assert(vtmp2 == xnoreg, "required.");
2323         vtmp2 = src2;
2324       }
2325       ptest(vtmp1, vtmp2);
2326      break;
2327     case 8:
2328       assert(vtmp1 != xnoreg, "required.");
2329       // Broadcast lower 64 bits to 128 bits before ptest
2330       pshufd(vtmp1, src1, 0x4);
2331       if (bt == BoolTest::overflow) {
2332         assert(vtmp2 != xnoreg, "required.");
2333         pshufd(vtmp2, src2, 0x4);
2334       } else {
2335         assert(vtmp2 == xnoreg, "required.");
2336         vtmp2 = src2;
2337       }
2338       ptest(vtmp1, vtmp2);
2339      break;
2340     case 16:
2341       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2342       ptest(src1, src2);
2343       break;
2344     case 32:
2345       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2346       vptest(src1, src2, Assembler::AVX_256bit);
2347       break;
2348     case 64:
2349       {
2350         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2351         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2352         if (bt == BoolTest::ne) {
2353           ktestql(mask, mask);
2354         } else {
2355           assert(bt == BoolTest::overflow, "required");
2356           kortestql(mask, mask);
2357         }
2358       }
2359       break;
2360     default:
2361       assert(false,"Should not reach here.");
2362       break;
2363   }
2364 }
2365 
2366 //-------------------------------------------------------------------------------------------
2367 
2368 // IndexOf for constant substrings with size >= 8 chars
2369 // which don't need to be loaded through stack.
2370 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2371                                          Register cnt1, Register cnt2,
2372                                          int int_cnt2,  Register result,
2373                                          XMMRegister vec, Register tmp,
2374                                          int ae) {
2375   ShortBranchVerifier sbv(this);
2376   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2377   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2378 
2379   // This method uses the pcmpestri instruction with bound registers
2380   //   inputs:
2381   //     xmm - substring
2382   //     rax - substring length (elements count)
2383   //     mem - scanned string
2384   //     rdx - string length (elements count)
2385   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2386   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2387   //   outputs:
2388   //     rcx - matched index in string
2389   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2390   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2391   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2392   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2393   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2394 
2395   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2396         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2397         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2398 
2399   // Note, inline_string_indexOf() generates checks:
2400   // if (substr.count > string.count) return -1;
2401   // if (substr.count == 0) return 0;
2402   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2403 
2404   // Load substring.
2405   if (ae == StrIntrinsicNode::UL) {
2406     pmovzxbw(vec, Address(str2, 0));
2407   } else {
2408     movdqu(vec, Address(str2, 0));
2409   }
2410   movl(cnt2, int_cnt2);
2411   movptr(result, str1); // string addr
2412 
2413   if (int_cnt2 > stride) {
2414     jmpb(SCAN_TO_SUBSTR);
2415 
2416     // Reload substr for rescan, this code
2417     // is executed only for large substrings (> 8 chars)
2418     bind(RELOAD_SUBSTR);
2419     if (ae == StrIntrinsicNode::UL) {
2420       pmovzxbw(vec, Address(str2, 0));
2421     } else {
2422       movdqu(vec, Address(str2, 0));
2423     }
2424     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2425 
2426     bind(RELOAD_STR);
2427     // We came here after the beginning of the substring was
2428     // matched but the rest of it was not so we need to search
2429     // again. Start from the next element after the previous match.
2430 
2431     // cnt2 is number of substring reminding elements and
2432     // cnt1 is number of string reminding elements when cmp failed.
2433     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2434     subl(cnt1, cnt2);
2435     addl(cnt1, int_cnt2);
2436     movl(cnt2, int_cnt2); // Now restore cnt2
2437 
2438     decrementl(cnt1);     // Shift to next element
2439     cmpl(cnt1, cnt2);
2440     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2441 
2442     addptr(result, (1<<scale1));
2443 
2444   } // (int_cnt2 > 8)
2445 
2446   // Scan string for start of substr in 16-byte vectors
2447   bind(SCAN_TO_SUBSTR);
2448   pcmpestri(vec, Address(result, 0), mode);
2449   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2450   subl(cnt1, stride);
2451   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2452   cmpl(cnt1, cnt2);
2453   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2454   addptr(result, 16);
2455   jmpb(SCAN_TO_SUBSTR);
2456 
2457   // Found a potential substr
2458   bind(FOUND_CANDIDATE);
2459   // Matched whole vector if first element matched (tmp(rcx) == 0).
2460   if (int_cnt2 == stride) {
2461     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2462   } else { // int_cnt2 > 8
2463     jccb(Assembler::overflow, FOUND_SUBSTR);
2464   }
2465   // After pcmpestri tmp(rcx) contains matched element index
2466   // Compute start addr of substr
2467   lea(result, Address(result, tmp, scale1));
2468 
2469   // Make sure string is still long enough
2470   subl(cnt1, tmp);
2471   cmpl(cnt1, cnt2);
2472   if (int_cnt2 == stride) {
2473     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2474   } else { // int_cnt2 > 8
2475     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2476   }
2477   // Left less then substring.
2478 
2479   bind(RET_NOT_FOUND);
2480   movl(result, -1);
2481   jmp(EXIT);
2482 
2483   if (int_cnt2 > stride) {
2484     // This code is optimized for the case when whole substring
2485     // is matched if its head is matched.
2486     bind(MATCH_SUBSTR_HEAD);
2487     pcmpestri(vec, Address(result, 0), mode);
2488     // Reload only string if does not match
2489     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2490 
2491     Label CONT_SCAN_SUBSTR;
2492     // Compare the rest of substring (> 8 chars).
2493     bind(FOUND_SUBSTR);
2494     // First 8 chars are already matched.
2495     negptr(cnt2);
2496     addptr(cnt2, stride);
2497 
2498     bind(SCAN_SUBSTR);
2499     subl(cnt1, stride);
2500     cmpl(cnt2, -stride); // Do not read beyond substring
2501     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2502     // Back-up strings to avoid reading beyond substring:
2503     // cnt1 = cnt1 - cnt2 + 8
2504     addl(cnt1, cnt2); // cnt2 is negative
2505     addl(cnt1, stride);
2506     movl(cnt2, stride); negptr(cnt2);
2507     bind(CONT_SCAN_SUBSTR);
2508     if (int_cnt2 < (int)G) {
2509       int tail_off1 = int_cnt2<<scale1;
2510       int tail_off2 = int_cnt2<<scale2;
2511       if (ae == StrIntrinsicNode::UL) {
2512         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2513       } else {
2514         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2515       }
2516       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2517     } else {
2518       // calculate index in register to avoid integer overflow (int_cnt2*2)
2519       movl(tmp, int_cnt2);
2520       addptr(tmp, cnt2);
2521       if (ae == StrIntrinsicNode::UL) {
2522         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2523       } else {
2524         movdqu(vec, Address(str2, tmp, scale2, 0));
2525       }
2526       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2527     }
2528     // Need to reload strings pointers if not matched whole vector
2529     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2530     addptr(cnt2, stride);
2531     jcc(Assembler::negative, SCAN_SUBSTR);
2532     // Fall through if found full substring
2533 
2534   } // (int_cnt2 > 8)
2535 
2536   bind(RET_FOUND);
2537   // Found result if we matched full small substring.
2538   // Compute substr offset
2539   subptr(result, str1);
2540   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2541     shrl(result, 1); // index
2542   }
2543   bind(EXIT);
2544 
2545 } // string_indexofC8
2546 
2547 // Small strings are loaded through stack if they cross page boundary.
2548 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2549                                        Register cnt1, Register cnt2,
2550                                        int int_cnt2,  Register result,
2551                                        XMMRegister vec, Register tmp,
2552                                        int ae) {
2553   ShortBranchVerifier sbv(this);
2554   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2555   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2556 
2557   //
2558   // int_cnt2 is length of small (< 8 chars) constant substring
2559   // or (-1) for non constant substring in which case its length
2560   // is in cnt2 register.
2561   //
2562   // Note, inline_string_indexOf() generates checks:
2563   // if (substr.count > string.count) return -1;
2564   // if (substr.count == 0) return 0;
2565   //
2566   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2567   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2568   // This method uses the pcmpestri instruction with bound registers
2569   //   inputs:
2570   //     xmm - substring
2571   //     rax - substring length (elements count)
2572   //     mem - scanned string
2573   //     rdx - string length (elements count)
2574   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2575   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2576   //   outputs:
2577   //     rcx - matched index in string
2578   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2579   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2580   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2581   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2582 
2583   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2584         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2585         FOUND_CANDIDATE;
2586 
2587   { //========================================================
2588     // We don't know where these strings are located
2589     // and we can't read beyond them. Load them through stack.
2590     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2591 
2592     movptr(tmp, rsp); // save old SP
2593 
2594     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2595       if (int_cnt2 == (1>>scale2)) { // One byte
2596         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2597         load_unsigned_byte(result, Address(str2, 0));
2598         movdl(vec, result); // move 32 bits
2599       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2600         // Not enough header space in 32-bit VM: 12+3 = 15.
2601         movl(result, Address(str2, -1));
2602         shrl(result, 8);
2603         movdl(vec, result); // move 32 bits
2604       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2605         load_unsigned_short(result, Address(str2, 0));
2606         movdl(vec, result); // move 32 bits
2607       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2608         movdl(vec, Address(str2, 0)); // move 32 bits
2609       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2610         movq(vec, Address(str2, 0));  // move 64 bits
2611       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2612         // Array header size is 12 bytes in 32-bit VM
2613         // + 6 bytes for 3 chars == 18 bytes,
2614         // enough space to load vec and shift.
2615         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2616         if (ae == StrIntrinsicNode::UL) {
2617           int tail_off = int_cnt2-8;
2618           pmovzxbw(vec, Address(str2, tail_off));
2619           psrldq(vec, -2*tail_off);
2620         }
2621         else {
2622           int tail_off = int_cnt2*(1<<scale2);
2623           movdqu(vec, Address(str2, tail_off-16));
2624           psrldq(vec, 16-tail_off);
2625         }
2626       }
2627     } else { // not constant substring
2628       cmpl(cnt2, stride);
2629       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2630 
2631       // We can read beyond string if srt+16 does not cross page boundary
2632       // since heaps are aligned and mapped by pages.
2633       assert(os::vm_page_size() < (int)G, "default page should be small");
2634       movl(result, str2); // We need only low 32 bits
2635       andl(result, (os::vm_page_size()-1));
2636       cmpl(result, (os::vm_page_size()-16));
2637       jccb(Assembler::belowEqual, CHECK_STR);
2638 
2639       // Move small strings to stack to allow load 16 bytes into vec.
2640       subptr(rsp, 16);
2641       int stk_offset = wordSize-(1<<scale2);
2642       push(cnt2);
2643 
2644       bind(COPY_SUBSTR);
2645       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2646         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2647         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2648       } else if (ae == StrIntrinsicNode::UU) {
2649         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2650         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2651       }
2652       decrement(cnt2);
2653       jccb(Assembler::notZero, COPY_SUBSTR);
2654 
2655       pop(cnt2);
2656       movptr(str2, rsp);  // New substring address
2657     } // non constant
2658 
2659     bind(CHECK_STR);
2660     cmpl(cnt1, stride);
2661     jccb(Assembler::aboveEqual, BIG_STRINGS);
2662 
2663     // Check cross page boundary.
2664     movl(result, str1); // We need only low 32 bits
2665     andl(result, (os::vm_page_size()-1));
2666     cmpl(result, (os::vm_page_size()-16));
2667     jccb(Assembler::belowEqual, BIG_STRINGS);
2668 
2669     subptr(rsp, 16);
2670     int stk_offset = -(1<<scale1);
2671     if (int_cnt2 < 0) { // not constant
2672       push(cnt2);
2673       stk_offset += wordSize;
2674     }
2675     movl(cnt2, cnt1);
2676 
2677     bind(COPY_STR);
2678     if (ae == StrIntrinsicNode::LL) {
2679       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2680       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2681     } else {
2682       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2683       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2684     }
2685     decrement(cnt2);
2686     jccb(Assembler::notZero, COPY_STR);
2687 
2688     if (int_cnt2 < 0) { // not constant
2689       pop(cnt2);
2690     }
2691     movptr(str1, rsp);  // New string address
2692 
2693     bind(BIG_STRINGS);
2694     // Load substring.
2695     if (int_cnt2 < 0) { // -1
2696       if (ae == StrIntrinsicNode::UL) {
2697         pmovzxbw(vec, Address(str2, 0));
2698       } else {
2699         movdqu(vec, Address(str2, 0));
2700       }
2701       push(cnt2);       // substr count
2702       push(str2);       // substr addr
2703       push(str1);       // string addr
2704     } else {
2705       // Small (< 8 chars) constant substrings are loaded already.
2706       movl(cnt2, int_cnt2);
2707     }
2708     push(tmp);  // original SP
2709 
2710   } // Finished loading
2711 
2712   //========================================================
2713   // Start search
2714   //
2715 
2716   movptr(result, str1); // string addr
2717 
2718   if (int_cnt2  < 0) {  // Only for non constant substring
2719     jmpb(SCAN_TO_SUBSTR);
2720 
2721     // SP saved at sp+0
2722     // String saved at sp+1*wordSize
2723     // Substr saved at sp+2*wordSize
2724     // Substr count saved at sp+3*wordSize
2725 
2726     // Reload substr for rescan, this code
2727     // is executed only for large substrings (> 8 chars)
2728     bind(RELOAD_SUBSTR);
2729     movptr(str2, Address(rsp, 2*wordSize));
2730     movl(cnt2, Address(rsp, 3*wordSize));
2731     if (ae == StrIntrinsicNode::UL) {
2732       pmovzxbw(vec, Address(str2, 0));
2733     } else {
2734       movdqu(vec, Address(str2, 0));
2735     }
2736     // We came here after the beginning of the substring was
2737     // matched but the rest of it was not so we need to search
2738     // again. Start from the next element after the previous match.
2739     subptr(str1, result); // Restore counter
2740     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2741       shrl(str1, 1);
2742     }
2743     addl(cnt1, str1);
2744     decrementl(cnt1);   // Shift to next element
2745     cmpl(cnt1, cnt2);
2746     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2747 
2748     addptr(result, (1<<scale1));
2749   } // non constant
2750 
2751   // Scan string for start of substr in 16-byte vectors
2752   bind(SCAN_TO_SUBSTR);
2753   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2754   pcmpestri(vec, Address(result, 0), mode);
2755   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2756   subl(cnt1, stride);
2757   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2758   cmpl(cnt1, cnt2);
2759   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2760   addptr(result, 16);
2761 
2762   bind(ADJUST_STR);
2763   cmpl(cnt1, stride); // Do not read beyond string
2764   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2765   // Back-up string to avoid reading beyond string.
2766   lea(result, Address(result, cnt1, scale1, -16));
2767   movl(cnt1, stride);
2768   jmpb(SCAN_TO_SUBSTR);
2769 
2770   // Found a potential substr
2771   bind(FOUND_CANDIDATE);
2772   // After pcmpestri tmp(rcx) contains matched element index
2773 
2774   // Make sure string is still long enough
2775   subl(cnt1, tmp);
2776   cmpl(cnt1, cnt2);
2777   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2778   // Left less then substring.
2779 
2780   bind(RET_NOT_FOUND);
2781   movl(result, -1);
2782   jmp(CLEANUP);
2783 
2784   bind(FOUND_SUBSTR);
2785   // Compute start addr of substr
2786   lea(result, Address(result, tmp, scale1));
2787   if (int_cnt2 > 0) { // Constant substring
2788     // Repeat search for small substring (< 8 chars)
2789     // from new point without reloading substring.
2790     // Have to check that we don't read beyond string.
2791     cmpl(tmp, stride-int_cnt2);
2792     jccb(Assembler::greater, ADJUST_STR);
2793     // Fall through if matched whole substring.
2794   } else { // non constant
2795     assert(int_cnt2 == -1, "should be != 0");
2796 
2797     addl(tmp, cnt2);
2798     // Found result if we matched whole substring.
2799     cmpl(tmp, stride);
2800     jcc(Assembler::lessEqual, RET_FOUND);
2801 
2802     // Repeat search for small substring (<= 8 chars)
2803     // from new point 'str1' without reloading substring.
2804     cmpl(cnt2, stride);
2805     // Have to check that we don't read beyond string.
2806     jccb(Assembler::lessEqual, ADJUST_STR);
2807 
2808     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2809     // Compare the rest of substring (> 8 chars).
2810     movptr(str1, result);
2811 
2812     cmpl(tmp, cnt2);
2813     // First 8 chars are already matched.
2814     jccb(Assembler::equal, CHECK_NEXT);
2815 
2816     bind(SCAN_SUBSTR);
2817     pcmpestri(vec, Address(str1, 0), mode);
2818     // Need to reload strings pointers if not matched whole vector
2819     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2820 
2821     bind(CHECK_NEXT);
2822     subl(cnt2, stride);
2823     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2824     addptr(str1, 16);
2825     if (ae == StrIntrinsicNode::UL) {
2826       addptr(str2, 8);
2827     } else {
2828       addptr(str2, 16);
2829     }
2830     subl(cnt1, stride);
2831     cmpl(cnt2, stride); // Do not read beyond substring
2832     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2833     // Back-up strings to avoid reading beyond substring.
2834 
2835     if (ae == StrIntrinsicNode::UL) {
2836       lea(str2, Address(str2, cnt2, scale2, -8));
2837       lea(str1, Address(str1, cnt2, scale1, -16));
2838     } else {
2839       lea(str2, Address(str2, cnt2, scale2, -16));
2840       lea(str1, Address(str1, cnt2, scale1, -16));
2841     }
2842     subl(cnt1, cnt2);
2843     movl(cnt2, stride);
2844     addl(cnt1, stride);
2845     bind(CONT_SCAN_SUBSTR);
2846     if (ae == StrIntrinsicNode::UL) {
2847       pmovzxbw(vec, Address(str2, 0));
2848     } else {
2849       movdqu(vec, Address(str2, 0));
2850     }
2851     jmp(SCAN_SUBSTR);
2852 
2853     bind(RET_FOUND_LONG);
2854     movptr(str1, Address(rsp, wordSize));
2855   } // non constant
2856 
2857   bind(RET_FOUND);
2858   // Compute substr offset
2859   subptr(result, str1);
2860   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2861     shrl(result, 1); // index
2862   }
2863   bind(CLEANUP);
2864   pop(rsp); // restore SP
2865 
2866 } // string_indexof
2867 
2868 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2869                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2870   ShortBranchVerifier sbv(this);
2871   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2872 
2873   int stride = 8;
2874 
2875   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2876         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2877         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2878         FOUND_SEQ_CHAR, DONE_LABEL;
2879 
2880   movptr(result, str1);
2881   if (UseAVX >= 2) {
2882     cmpl(cnt1, stride);
2883     jcc(Assembler::less, SCAN_TO_CHAR);
2884     cmpl(cnt1, 2*stride);
2885     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2886     movdl(vec1, ch);
2887     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2888     vpxor(vec2, vec2);
2889     movl(tmp, cnt1);
2890     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2891     andl(cnt1,0x0000000F);  //tail count (in chars)
2892 
2893     bind(SCAN_TO_16_CHAR_LOOP);
2894     vmovdqu(vec3, Address(result, 0));
2895     vpcmpeqw(vec3, vec3, vec1, 1);
2896     vptest(vec2, vec3);
2897     jcc(Assembler::carryClear, FOUND_CHAR);
2898     addptr(result, 32);
2899     subl(tmp, 2*stride);
2900     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2901     jmp(SCAN_TO_8_CHAR);
2902     bind(SCAN_TO_8_CHAR_INIT);
2903     movdl(vec1, ch);
2904     pshuflw(vec1, vec1, 0x00);
2905     pshufd(vec1, vec1, 0);
2906     pxor(vec2, vec2);
2907   }
2908   bind(SCAN_TO_8_CHAR);
2909   cmpl(cnt1, stride);
2910   jcc(Assembler::less, SCAN_TO_CHAR);
2911   if (UseAVX < 2) {
2912     movdl(vec1, ch);
2913     pshuflw(vec1, vec1, 0x00);
2914     pshufd(vec1, vec1, 0);
2915     pxor(vec2, vec2);
2916   }
2917   movl(tmp, cnt1);
2918   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2919   andl(cnt1,0x00000007);  //tail count (in chars)
2920 
2921   bind(SCAN_TO_8_CHAR_LOOP);
2922   movdqu(vec3, Address(result, 0));
2923   pcmpeqw(vec3, vec1);
2924   ptest(vec2, vec3);
2925   jcc(Assembler::carryClear, FOUND_CHAR);
2926   addptr(result, 16);
2927   subl(tmp, stride);
2928   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2929   bind(SCAN_TO_CHAR);
2930   testl(cnt1, cnt1);
2931   jcc(Assembler::zero, RET_NOT_FOUND);
2932   bind(SCAN_TO_CHAR_LOOP);
2933   load_unsigned_short(tmp, Address(result, 0));
2934   cmpl(ch, tmp);
2935   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2936   addptr(result, 2);
2937   subl(cnt1, 1);
2938   jccb(Assembler::zero, RET_NOT_FOUND);
2939   jmp(SCAN_TO_CHAR_LOOP);
2940 
2941   bind(RET_NOT_FOUND);
2942   movl(result, -1);
2943   jmpb(DONE_LABEL);
2944 
2945   bind(FOUND_CHAR);
2946   if (UseAVX >= 2) {
2947     vpmovmskb(tmp, vec3);
2948   } else {
2949     pmovmskb(tmp, vec3);
2950   }
2951   bsfl(ch, tmp);
2952   addptr(result, ch);
2953 
2954   bind(FOUND_SEQ_CHAR);
2955   subptr(result, str1);
2956   shrl(result, 1);
2957 
2958   bind(DONE_LABEL);
2959 } // string_indexof_char
2960 
2961 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2962                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2963   ShortBranchVerifier sbv(this);
2964   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2965 
2966   int stride = 16;
2967 
2968   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2969         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2970         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2971         FOUND_SEQ_CHAR, DONE_LABEL;
2972 
2973   movptr(result, str1);
2974   if (UseAVX >= 2) {
2975     cmpl(cnt1, stride);
2976     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2977     cmpl(cnt1, stride*2);
2978     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2979     movdl(vec1, ch);
2980     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2981     vpxor(vec2, vec2);
2982     movl(tmp, cnt1);
2983     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2984     andl(cnt1,0x0000001F);  //tail count (in chars)
2985 
2986     bind(SCAN_TO_32_CHAR_LOOP);
2987     vmovdqu(vec3, Address(result, 0));
2988     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2989     vptest(vec2, vec3);
2990     jcc(Assembler::carryClear, FOUND_CHAR);
2991     addptr(result, 32);
2992     subl(tmp, stride*2);
2993     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2994     jmp(SCAN_TO_16_CHAR);
2995 
2996     bind(SCAN_TO_16_CHAR_INIT);
2997     movdl(vec1, ch);
2998     pxor(vec2, vec2);
2999     pshufb(vec1, vec2);
3000   }
3001 
3002   bind(SCAN_TO_16_CHAR);
3003   cmpl(cnt1, stride);
3004   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entires left
3005   if (UseAVX < 2) {
3006     movdl(vec1, ch);
3007     pxor(vec2, vec2);
3008     pshufb(vec1, vec2);
3009   }
3010   movl(tmp, cnt1);
3011   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3012   andl(cnt1,0x0000000F);  //tail count (in bytes)
3013 
3014   bind(SCAN_TO_16_CHAR_LOOP);
3015   movdqu(vec3, Address(result, 0));
3016   pcmpeqb(vec3, vec1);
3017   ptest(vec2, vec3);
3018   jcc(Assembler::carryClear, FOUND_CHAR);
3019   addptr(result, 16);
3020   subl(tmp, stride);
3021   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3022 
3023   bind(SCAN_TO_CHAR_INIT);
3024   testl(cnt1, cnt1);
3025   jcc(Assembler::zero, RET_NOT_FOUND);
3026   bind(SCAN_TO_CHAR_LOOP);
3027   load_unsigned_byte(tmp, Address(result, 0));
3028   cmpl(ch, tmp);
3029   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3030   addptr(result, 1);
3031   subl(cnt1, 1);
3032   jccb(Assembler::zero, RET_NOT_FOUND);
3033   jmp(SCAN_TO_CHAR_LOOP);
3034 
3035   bind(RET_NOT_FOUND);
3036   movl(result, -1);
3037   jmpb(DONE_LABEL);
3038 
3039   bind(FOUND_CHAR);
3040   if (UseAVX >= 2) {
3041     vpmovmskb(tmp, vec3);
3042   } else {
3043     pmovmskb(tmp, vec3);
3044   }
3045   bsfl(ch, tmp);
3046   addptr(result, ch);
3047 
3048   bind(FOUND_SEQ_CHAR);
3049   subptr(result, str1);
3050 
3051   bind(DONE_LABEL);
3052 } // stringL_indexof_char
3053 
3054 // helper function for string_compare
3055 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3056                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3057                                            Address::ScaleFactor scale2, Register index, int ae) {
3058   if (ae == StrIntrinsicNode::LL) {
3059     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3060     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3061   } else if (ae == StrIntrinsicNode::UU) {
3062     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3063     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3064   } else {
3065     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3066     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3067   }
3068 }
3069 
3070 // Compare strings, used for char[] and byte[].
3071 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3072                                        Register cnt1, Register cnt2, Register result,
3073                                        XMMRegister vec1, int ae, KRegister mask) {
3074   ShortBranchVerifier sbv(this);
3075   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3076   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3077   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3078   int stride2x2 = 0x40;
3079   Address::ScaleFactor scale = Address::no_scale;
3080   Address::ScaleFactor scale1 = Address::no_scale;
3081   Address::ScaleFactor scale2 = Address::no_scale;
3082 
3083   if (ae != StrIntrinsicNode::LL) {
3084     stride2x2 = 0x20;
3085   }
3086 
3087   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3088     shrl(cnt2, 1);
3089   }
3090   // Compute the minimum of the string lengths and the
3091   // difference of the string lengths (stack).
3092   // Do the conditional move stuff
3093   movl(result, cnt1);
3094   subl(cnt1, cnt2);
3095   push(cnt1);
3096   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3097 
3098   // Is the minimum length zero?
3099   testl(cnt2, cnt2);
3100   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3101   if (ae == StrIntrinsicNode::LL) {
3102     // Load first bytes
3103     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3104     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3105   } else if (ae == StrIntrinsicNode::UU) {
3106     // Load first characters
3107     load_unsigned_short(result, Address(str1, 0));
3108     load_unsigned_short(cnt1, Address(str2, 0));
3109   } else {
3110     load_unsigned_byte(result, Address(str1, 0));
3111     load_unsigned_short(cnt1, Address(str2, 0));
3112   }
3113   subl(result, cnt1);
3114   jcc(Assembler::notZero,  POP_LABEL);
3115 
3116   if (ae == StrIntrinsicNode::UU) {
3117     // Divide length by 2 to get number of chars
3118     shrl(cnt2, 1);
3119   }
3120   cmpl(cnt2, 1);
3121   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3122 
3123   // Check if the strings start at the same location and setup scale and stride
3124   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3125     cmpptr(str1, str2);
3126     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3127     if (ae == StrIntrinsicNode::LL) {
3128       scale = Address::times_1;
3129       stride = 16;
3130     } else {
3131       scale = Address::times_2;
3132       stride = 8;
3133     }
3134   } else {
3135     scale1 = Address::times_1;
3136     scale2 = Address::times_2;
3137     // scale not used
3138     stride = 8;
3139   }
3140 
3141   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3142     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3143     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3144     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3145     Label COMPARE_TAIL_LONG;
3146     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3147 
3148     int pcmpmask = 0x19;
3149     if (ae == StrIntrinsicNode::LL) {
3150       pcmpmask &= ~0x01;
3151     }
3152 
3153     // Setup to compare 16-chars (32-bytes) vectors,
3154     // start from first character again because it has aligned address.
3155     if (ae == StrIntrinsicNode::LL) {
3156       stride2 = 32;
3157     } else {
3158       stride2 = 16;
3159     }
3160     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3161       adr_stride = stride << scale;
3162     } else {
3163       adr_stride1 = 8;  //stride << scale1;
3164       adr_stride2 = 16; //stride << scale2;
3165     }
3166 
3167     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3168     // rax and rdx are used by pcmpestri as elements counters
3169     movl(result, cnt2);
3170     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3171     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3172 
3173     // fast path : compare first 2 8-char vectors.
3174     bind(COMPARE_16_CHARS);
3175     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3176       movdqu(vec1, Address(str1, 0));
3177     } else {
3178       pmovzxbw(vec1, Address(str1, 0));
3179     }
3180     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3181     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3182 
3183     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3184       movdqu(vec1, Address(str1, adr_stride));
3185       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3186     } else {
3187       pmovzxbw(vec1, Address(str1, adr_stride1));
3188       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3189     }
3190     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3191     addl(cnt1, stride);
3192 
3193     // Compare the characters at index in cnt1
3194     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3195     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3196     subl(result, cnt2);
3197     jmp(POP_LABEL);
3198 
3199     // Setup the registers to start vector comparison loop
3200     bind(COMPARE_WIDE_VECTORS);
3201     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3202       lea(str1, Address(str1, result, scale));
3203       lea(str2, Address(str2, result, scale));
3204     } else {
3205       lea(str1, Address(str1, result, scale1));
3206       lea(str2, Address(str2, result, scale2));
3207     }
3208     subl(result, stride2);
3209     subl(cnt2, stride2);
3210     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3211     negptr(result);
3212 
3213     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3214     bind(COMPARE_WIDE_VECTORS_LOOP);
3215 
3216 #ifdef _LP64
3217     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3218       cmpl(cnt2, stride2x2);
3219       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3220       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3221       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3222 
3223       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3224       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3225         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3226         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3227       } else {
3228         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3229         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3230       }
3231       kortestql(mask, mask);
3232       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3233       addptr(result, stride2x2);  // update since we already compared at this addr
3234       subl(cnt2, stride2x2);      // and sub the size too
3235       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3236 
3237       vpxor(vec1, vec1);
3238       jmpb(COMPARE_WIDE_TAIL);
3239     }//if (VM_Version::supports_avx512vlbw())
3240 #endif // _LP64
3241 
3242 
3243     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3244     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3245       vmovdqu(vec1, Address(str1, result, scale));
3246       vpxor(vec1, Address(str2, result, scale));
3247     } else {
3248       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3249       vpxor(vec1, Address(str2, result, scale2));
3250     }
3251     vptest(vec1, vec1);
3252     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3253     addptr(result, stride2);
3254     subl(cnt2, stride2);
3255     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3256     // clean upper bits of YMM registers
3257     vpxor(vec1, vec1);
3258 
3259     // compare wide vectors tail
3260     bind(COMPARE_WIDE_TAIL);
3261     testptr(result, result);
3262     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3263 
3264     movl(result, stride2);
3265     movl(cnt2, result);
3266     negptr(result);
3267     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3268 
3269     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3270     bind(VECTOR_NOT_EQUAL);
3271     // clean upper bits of YMM registers
3272     vpxor(vec1, vec1);
3273     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3274       lea(str1, Address(str1, result, scale));
3275       lea(str2, Address(str2, result, scale));
3276     } else {
3277       lea(str1, Address(str1, result, scale1));
3278       lea(str2, Address(str2, result, scale2));
3279     }
3280     jmp(COMPARE_16_CHARS);
3281 
3282     // Compare tail chars, length between 1 to 15 chars
3283     bind(COMPARE_TAIL_LONG);
3284     movl(cnt2, result);
3285     cmpl(cnt2, stride);
3286     jcc(Assembler::less, COMPARE_SMALL_STR);
3287 
3288     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3289       movdqu(vec1, Address(str1, 0));
3290     } else {
3291       pmovzxbw(vec1, Address(str1, 0));
3292     }
3293     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3294     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3295     subptr(cnt2, stride);
3296     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3297     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3298       lea(str1, Address(str1, result, scale));
3299       lea(str2, Address(str2, result, scale));
3300     } else {
3301       lea(str1, Address(str1, result, scale1));
3302       lea(str2, Address(str2, result, scale2));
3303     }
3304     negptr(cnt2);
3305     jmpb(WHILE_HEAD_LABEL);
3306 
3307     bind(COMPARE_SMALL_STR);
3308   } else if (UseSSE42Intrinsics) {
3309     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3310     int pcmpmask = 0x19;
3311     // Setup to compare 8-char (16-byte) vectors,
3312     // start from first character again because it has aligned address.
3313     movl(result, cnt2);
3314     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3315     if (ae == StrIntrinsicNode::LL) {
3316       pcmpmask &= ~0x01;
3317     }
3318     jcc(Assembler::zero, COMPARE_TAIL);
3319     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3320       lea(str1, Address(str1, result, scale));
3321       lea(str2, Address(str2, result, scale));
3322     } else {
3323       lea(str1, Address(str1, result, scale1));
3324       lea(str2, Address(str2, result, scale2));
3325     }
3326     negptr(result);
3327 
3328     // pcmpestri
3329     //   inputs:
3330     //     vec1- substring
3331     //     rax - negative string length (elements count)
3332     //     mem - scanned string
3333     //     rdx - string length (elements count)
3334     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3335     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3336     //   outputs:
3337     //     rcx - first mismatched element index
3338     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3339 
3340     bind(COMPARE_WIDE_VECTORS);
3341     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3342       movdqu(vec1, Address(str1, result, scale));
3343       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3344     } else {
3345       pmovzxbw(vec1, Address(str1, result, scale1));
3346       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3347     }
3348     // After pcmpestri cnt1(rcx) contains mismatched element index
3349 
3350     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3351     addptr(result, stride);
3352     subptr(cnt2, stride);
3353     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3354 
3355     // compare wide vectors tail
3356     testptr(result, result);
3357     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3358 
3359     movl(cnt2, stride);
3360     movl(result, stride);
3361     negptr(result);
3362     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3363       movdqu(vec1, Address(str1, result, scale));
3364       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3365     } else {
3366       pmovzxbw(vec1, Address(str1, result, scale1));
3367       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3368     }
3369     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3370 
3371     // Mismatched characters in the vectors
3372     bind(VECTOR_NOT_EQUAL);
3373     addptr(cnt1, result);
3374     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3375     subl(result, cnt2);
3376     jmpb(POP_LABEL);
3377 
3378     bind(COMPARE_TAIL); // limit is zero
3379     movl(cnt2, result);
3380     // Fallthru to tail compare
3381   }
3382   // Shift str2 and str1 to the end of the arrays, negate min
3383   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3384     lea(str1, Address(str1, cnt2, scale));
3385     lea(str2, Address(str2, cnt2, scale));
3386   } else {
3387     lea(str1, Address(str1, cnt2, scale1));
3388     lea(str2, Address(str2, cnt2, scale2));
3389   }
3390   decrementl(cnt2);  // first character was compared already
3391   negptr(cnt2);
3392 
3393   // Compare the rest of the elements
3394   bind(WHILE_HEAD_LABEL);
3395   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3396   subl(result, cnt1);
3397   jccb(Assembler::notZero, POP_LABEL);
3398   increment(cnt2);
3399   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3400 
3401   // Strings are equal up to min length.  Return the length difference.
3402   bind(LENGTH_DIFF_LABEL);
3403   pop(result);
3404   if (ae == StrIntrinsicNode::UU) {
3405     // Divide diff by 2 to get number of chars
3406     sarl(result, 1);
3407   }
3408   jmpb(DONE_LABEL);
3409 
3410 #ifdef _LP64
3411   if (VM_Version::supports_avx512vlbw()) {
3412 
3413     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3414 
3415     kmovql(cnt1, mask);
3416     notq(cnt1);
3417     bsfq(cnt2, cnt1);
3418     if (ae != StrIntrinsicNode::LL) {
3419       // Divide diff by 2 to get number of chars
3420       sarl(cnt2, 1);
3421     }
3422     addq(result, cnt2);
3423     if (ae == StrIntrinsicNode::LL) {
3424       load_unsigned_byte(cnt1, Address(str2, result));
3425       load_unsigned_byte(result, Address(str1, result));
3426     } else if (ae == StrIntrinsicNode::UU) {
3427       load_unsigned_short(cnt1, Address(str2, result, scale));
3428       load_unsigned_short(result, Address(str1, result, scale));
3429     } else {
3430       load_unsigned_short(cnt1, Address(str2, result, scale2));
3431       load_unsigned_byte(result, Address(str1, result, scale1));
3432     }
3433     subl(result, cnt1);
3434     jmpb(POP_LABEL);
3435   }//if (VM_Version::supports_avx512vlbw())
3436 #endif // _LP64
3437 
3438   // Discard the stored length difference
3439   bind(POP_LABEL);
3440   pop(cnt1);
3441 
3442   // That's it
3443   bind(DONE_LABEL);
3444   if(ae == StrIntrinsicNode::UL) {
3445     negl(result);
3446   }
3447 
3448 }
3449 
3450 // Search for Non-ASCII character (Negative byte value) in a byte array,
3451 // return true if it has any and false otherwise.
3452 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3453 //   @IntrinsicCandidate
3454 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3455 //     for (int i = off; i < off + len; i++) {
3456 //       if (ba[i] < 0) {
3457 //         return true;
3458 //       }
3459 //     }
3460 //     return false;
3461 //   }
3462 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3463   Register result, Register tmp1,
3464   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3465   // rsi: byte array
3466   // rcx: len
3467   // rax: result
3468   ShortBranchVerifier sbv(this);
3469   assert_different_registers(ary1, len, result, tmp1);
3470   assert_different_registers(vec1, vec2);
3471   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3472 
3473   // len == 0
3474   testl(len, len);
3475   jcc(Assembler::zero, FALSE_LABEL);
3476 
3477   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3478     VM_Version::supports_avx512vlbw() &&
3479     VM_Version::supports_bmi2()) {
3480 
3481     Label test_64_loop, test_tail;
3482     Register tmp3_aliased = len;
3483 
3484     movl(tmp1, len);
3485     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3486 
3487     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3488     andl(len, ~(64 - 1));    // vector count (in chars)
3489     jccb(Assembler::zero, test_tail);
3490 
3491     lea(ary1, Address(ary1, len, Address::times_1));
3492     negptr(len);
3493 
3494     bind(test_64_loop);
3495     // Check whether our 64 elements of size byte contain negatives
3496     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3497     kortestql(mask1, mask1);
3498     jcc(Assembler::notZero, TRUE_LABEL);
3499 
3500     addptr(len, 64);
3501     jccb(Assembler::notZero, test_64_loop);
3502 
3503 
3504     bind(test_tail);
3505     // bail out when there is nothing to be done
3506     testl(tmp1, -1);
3507     jcc(Assembler::zero, FALSE_LABEL);
3508 
3509     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3510 #ifdef _LP64
3511     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3512     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3513     notq(tmp3_aliased);
3514     kmovql(mask2, tmp3_aliased);
3515 #else
3516     Label k_init;
3517     jmp(k_init);
3518 
3519     // We could not read 64-bits from a general purpose register thus we move
3520     // data required to compose 64 1's to the instruction stream
3521     // We emit 64 byte wide series of elements from 0..63 which later on would
3522     // be used as a compare targets with tail count contained in tmp1 register.
3523     // Result would be a k register having tmp1 consecutive number or 1
3524     // counting from least significant bit.
3525     address tmp = pc();
3526     emit_int64(0x0706050403020100);
3527     emit_int64(0x0F0E0D0C0B0A0908);
3528     emit_int64(0x1716151413121110);
3529     emit_int64(0x1F1E1D1C1B1A1918);
3530     emit_int64(0x2726252423222120);
3531     emit_int64(0x2F2E2D2C2B2A2928);
3532     emit_int64(0x3736353433323130);
3533     emit_int64(0x3F3E3D3C3B3A3938);
3534 
3535     bind(k_init);
3536     lea(len, InternalAddress(tmp));
3537     // create mask to test for negative byte inside a vector
3538     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3539     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3540 
3541 #endif
3542     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3543     ktestq(mask1, mask2);
3544     jcc(Assembler::notZero, TRUE_LABEL);
3545 
3546     jmp(FALSE_LABEL);
3547   } else {
3548     movl(result, len); // copy
3549 
3550     if (UseAVX >= 2 && UseSSE >= 2) {
3551       // With AVX2, use 32-byte vector compare
3552       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3553 
3554       // Compare 32-byte vectors
3555       andl(result, 0x0000001f);  //   tail count (in bytes)
3556       andl(len, 0xffffffe0);   // vector count (in bytes)
3557       jccb(Assembler::zero, COMPARE_TAIL);
3558 
3559       lea(ary1, Address(ary1, len, Address::times_1));
3560       negptr(len);
3561 
3562       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3563       movdl(vec2, tmp1);
3564       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3565 
3566       bind(COMPARE_WIDE_VECTORS);
3567       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3568       vptest(vec1, vec2);
3569       jccb(Assembler::notZero, TRUE_LABEL);
3570       addptr(len, 32);
3571       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3572 
3573       testl(result, result);
3574       jccb(Assembler::zero, FALSE_LABEL);
3575 
3576       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3577       vptest(vec1, vec2);
3578       jccb(Assembler::notZero, TRUE_LABEL);
3579       jmpb(FALSE_LABEL);
3580 
3581       bind(COMPARE_TAIL); // len is zero
3582       movl(len, result);
3583       // Fallthru to tail compare
3584     } else if (UseSSE42Intrinsics) {
3585       // With SSE4.2, use double quad vector compare
3586       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3587 
3588       // Compare 16-byte vectors
3589       andl(result, 0x0000000f);  //   tail count (in bytes)
3590       andl(len, 0xfffffff0);   // vector count (in bytes)
3591       jcc(Assembler::zero, COMPARE_TAIL);
3592 
3593       lea(ary1, Address(ary1, len, Address::times_1));
3594       negptr(len);
3595 
3596       movl(tmp1, 0x80808080);
3597       movdl(vec2, tmp1);
3598       pshufd(vec2, vec2, 0);
3599 
3600       bind(COMPARE_WIDE_VECTORS);
3601       movdqu(vec1, Address(ary1, len, Address::times_1));
3602       ptest(vec1, vec2);
3603       jcc(Assembler::notZero, TRUE_LABEL);
3604       addptr(len, 16);
3605       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3606 
3607       testl(result, result);
3608       jcc(Assembler::zero, FALSE_LABEL);
3609 
3610       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3611       ptest(vec1, vec2);
3612       jccb(Assembler::notZero, TRUE_LABEL);
3613       jmpb(FALSE_LABEL);
3614 
3615       bind(COMPARE_TAIL); // len is zero
3616       movl(len, result);
3617       // Fallthru to tail compare
3618     }
3619   }
3620   // Compare 4-byte vectors
3621   andl(len, 0xfffffffc); // vector count (in bytes)
3622   jccb(Assembler::zero, COMPARE_CHAR);
3623 
3624   lea(ary1, Address(ary1, len, Address::times_1));
3625   negptr(len);
3626 
3627   bind(COMPARE_VECTORS);
3628   movl(tmp1, Address(ary1, len, Address::times_1));
3629   andl(tmp1, 0x80808080);
3630   jccb(Assembler::notZero, TRUE_LABEL);
3631   addptr(len, 4);
3632   jcc(Assembler::notZero, COMPARE_VECTORS);
3633 
3634   // Compare trailing char (final 2 bytes), if any
3635   bind(COMPARE_CHAR);
3636   testl(result, 0x2);   // tail  char
3637   jccb(Assembler::zero, COMPARE_BYTE);
3638   load_unsigned_short(tmp1, Address(ary1, 0));
3639   andl(tmp1, 0x00008080);
3640   jccb(Assembler::notZero, TRUE_LABEL);
3641   subptr(result, 2);
3642   lea(ary1, Address(ary1, 2));
3643 
3644   bind(COMPARE_BYTE);
3645   testl(result, 0x1);   // tail  byte
3646   jccb(Assembler::zero, FALSE_LABEL);
3647   load_unsigned_byte(tmp1, Address(ary1, 0));
3648   andl(tmp1, 0x00000080);
3649   jccb(Assembler::notEqual, TRUE_LABEL);
3650   jmpb(FALSE_LABEL);
3651 
3652   bind(TRUE_LABEL);
3653   movl(result, 1);   // return true
3654   jmpb(DONE);
3655 
3656   bind(FALSE_LABEL);
3657   xorl(result, result); // return false
3658 
3659   // That's it
3660   bind(DONE);
3661   if (UseAVX >= 2 && UseSSE >= 2) {
3662     // clean upper bits of YMM registers
3663     vpxor(vec1, vec1);
3664     vpxor(vec2, vec2);
3665   }
3666 }
3667 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
3668 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3669                                       Register limit, Register result, Register chr,
3670                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3671   ShortBranchVerifier sbv(this);
3672   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3673 
3674   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3675   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3676 
3677   if (is_array_equ) {
3678     // Check the input args
3679     cmpoop(ary1, ary2);
3680     jcc(Assembler::equal, TRUE_LABEL);
3681 
3682     // Need additional checks for arrays_equals.
3683     testptr(ary1, ary1);
3684     jcc(Assembler::zero, FALSE_LABEL);
3685     testptr(ary2, ary2);
3686     jcc(Assembler::zero, FALSE_LABEL);
3687 
3688     // Check the lengths
3689     movl(limit, Address(ary1, length_offset));
3690     cmpl(limit, Address(ary2, length_offset));
3691     jcc(Assembler::notEqual, FALSE_LABEL);
3692   }
3693 
3694   // count == 0
3695   testl(limit, limit);
3696   jcc(Assembler::zero, TRUE_LABEL);
3697 
3698   if (is_array_equ) {
3699     // Load array address
3700     lea(ary1, Address(ary1, base_offset));
3701     lea(ary2, Address(ary2, base_offset));
3702   }
3703 
3704   if (is_array_equ && is_char) {
3705     // arrays_equals when used for char[].
3706     shll(limit, 1);      // byte count != 0
3707   }
3708   movl(result, limit); // copy
3709 
3710   if (UseAVX >= 2) {
3711     // With AVX2, use 32-byte vector compare
3712     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3713 
3714     // Compare 32-byte vectors
3715     andl(result, 0x0000001f);  //   tail count (in bytes)
3716     andl(limit, 0xffffffe0);   // vector count (in bytes)
3717     jcc(Assembler::zero, COMPARE_TAIL);
3718 
3719     lea(ary1, Address(ary1, limit, Address::times_1));
3720     lea(ary2, Address(ary2, limit, Address::times_1));
3721     negptr(limit);
3722 
3723 #ifdef _LP64
3724     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3725       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3726 
3727       cmpl(limit, -64);
3728       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3729 
3730       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3731 
3732       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3733       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3734       kortestql(mask, mask);
3735       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3736       addptr(limit, 64);  // update since we already compared at this addr
3737       cmpl(limit, -64);
3738       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3739 
3740       // At this point we may still need to compare -limit+result bytes.
3741       // We could execute the next two instruction and just continue via non-wide path:
3742       //  cmpl(limit, 0);
3743       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3744       // But since we stopped at the points ary{1,2}+limit which are
3745       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3746       // (|limit| <= 32 and result < 32),
3747       // we may just compare the last 64 bytes.
3748       //
3749       addptr(result, -64);   // it is safe, bc we just came from this area
3750       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3751       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3752       kortestql(mask, mask);
3753       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3754 
3755       jmp(TRUE_LABEL);
3756 
3757       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3758 
3759     }//if (VM_Version::supports_avx512vlbw())
3760 #endif //_LP64
3761     bind(COMPARE_WIDE_VECTORS);
3762     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3763     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3764     vpxor(vec1, vec2);
3765 
3766     vptest(vec1, vec1);
3767     jcc(Assembler::notZero, FALSE_LABEL);
3768     addptr(limit, 32);
3769     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3770 
3771     testl(result, result);
3772     jcc(Assembler::zero, TRUE_LABEL);
3773 
3774     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3775     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3776     vpxor(vec1, vec2);
3777 
3778     vptest(vec1, vec1);
3779     jccb(Assembler::notZero, FALSE_LABEL);
3780     jmpb(TRUE_LABEL);
3781 
3782     bind(COMPARE_TAIL); // limit is zero
3783     movl(limit, result);
3784     // Fallthru to tail compare
3785   } else if (UseSSE42Intrinsics) {
3786     // With SSE4.2, use double quad vector compare
3787     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3788 
3789     // Compare 16-byte vectors
3790     andl(result, 0x0000000f);  //   tail count (in bytes)
3791     andl(limit, 0xfffffff0);   // vector count (in bytes)
3792     jcc(Assembler::zero, COMPARE_TAIL);
3793 
3794     lea(ary1, Address(ary1, limit, Address::times_1));
3795     lea(ary2, Address(ary2, limit, Address::times_1));
3796     negptr(limit);
3797 
3798     bind(COMPARE_WIDE_VECTORS);
3799     movdqu(vec1, Address(ary1, limit, Address::times_1));
3800     movdqu(vec2, Address(ary2, limit, Address::times_1));
3801     pxor(vec1, vec2);
3802 
3803     ptest(vec1, vec1);
3804     jcc(Assembler::notZero, FALSE_LABEL);
3805     addptr(limit, 16);
3806     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3807 
3808     testl(result, result);
3809     jcc(Assembler::zero, TRUE_LABEL);
3810 
3811     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3812     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3813     pxor(vec1, vec2);
3814 
3815     ptest(vec1, vec1);
3816     jccb(Assembler::notZero, FALSE_LABEL);
3817     jmpb(TRUE_LABEL);
3818 
3819     bind(COMPARE_TAIL); // limit is zero
3820     movl(limit, result);
3821     // Fallthru to tail compare
3822   }
3823 
3824   // Compare 4-byte vectors
3825   andl(limit, 0xfffffffc); // vector count (in bytes)
3826   jccb(Assembler::zero, COMPARE_CHAR);
3827 
3828   lea(ary1, Address(ary1, limit, Address::times_1));
3829   lea(ary2, Address(ary2, limit, Address::times_1));
3830   negptr(limit);
3831 
3832   bind(COMPARE_VECTORS);
3833   movl(chr, Address(ary1, limit, Address::times_1));
3834   cmpl(chr, Address(ary2, limit, Address::times_1));
3835   jccb(Assembler::notEqual, FALSE_LABEL);
3836   addptr(limit, 4);
3837   jcc(Assembler::notZero, COMPARE_VECTORS);
3838 
3839   // Compare trailing char (final 2 bytes), if any
3840   bind(COMPARE_CHAR);
3841   testl(result, 0x2);   // tail  char
3842   jccb(Assembler::zero, COMPARE_BYTE);
3843   load_unsigned_short(chr, Address(ary1, 0));
3844   load_unsigned_short(limit, Address(ary2, 0));
3845   cmpl(chr, limit);
3846   jccb(Assembler::notEqual, FALSE_LABEL);
3847 
3848   if (is_array_equ && is_char) {
3849     bind(COMPARE_BYTE);
3850   } else {
3851     lea(ary1, Address(ary1, 2));
3852     lea(ary2, Address(ary2, 2));
3853 
3854     bind(COMPARE_BYTE);
3855     testl(result, 0x1);   // tail  byte
3856     jccb(Assembler::zero, TRUE_LABEL);
3857     load_unsigned_byte(chr, Address(ary1, 0));
3858     load_unsigned_byte(limit, Address(ary2, 0));
3859     cmpl(chr, limit);
3860     jccb(Assembler::notEqual, FALSE_LABEL);
3861   }
3862   bind(TRUE_LABEL);
3863   movl(result, 1);   // return true
3864   jmpb(DONE);
3865 
3866   bind(FALSE_LABEL);
3867   xorl(result, result); // return false
3868 
3869   // That's it
3870   bind(DONE);
3871   if (UseAVX >= 2) {
3872     // clean upper bits of YMM registers
3873     vpxor(vec1, vec1);
3874     vpxor(vec2, vec2);
3875   }
3876 }
3877 
3878 #ifdef _LP64
3879 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3880                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3881   assert(VM_Version::supports_avx512vlbw(), "");
3882   vpxor(xtmp, xtmp, xtmp, vec_enc);
3883   vpsubb(xtmp, xtmp, mask, vec_enc);
3884   evpmovb2m(ktmp, xtmp, vec_enc);
3885   kmovql(tmp, ktmp);
3886   switch(opc) {
3887     case Op_VectorMaskTrueCount:
3888       popcntq(dst, tmp);
3889       break;
3890     case Op_VectorMaskLastTrue:
3891       mov64(dst, -1);
3892       bsrq(tmp, tmp);
3893       cmov(Assembler::notZero, dst, tmp);
3894       break;
3895     case Op_VectorMaskFirstTrue:
3896       mov64(dst, masklen);
3897       bsfq(tmp, tmp);
3898       cmov(Assembler::notZero, dst, tmp);
3899       break;
3900     default: assert(false, "Unhandled mask operation");
3901   }
3902 }
3903 
3904 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3905                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3906   assert(VM_Version::supports_avx(), "");
3907   vpxor(xtmp, xtmp, xtmp, vec_enc);
3908   vpsubb(xtmp, xtmp, mask, vec_enc);
3909   vpmovmskb(tmp, xtmp, vec_enc);
3910   if (masklen < 64) {
3911     andq(tmp, (((jlong)1 << masklen) - 1));
3912   }
3913   switch(opc) {
3914     case Op_VectorMaskTrueCount:
3915       popcntq(dst, tmp);
3916       break;
3917     case Op_VectorMaskLastTrue:
3918       mov64(dst, -1);
3919       bsrq(tmp, tmp);
3920       cmov(Assembler::notZero, dst, tmp);
3921       break;
3922     case Op_VectorMaskFirstTrue:
3923       mov64(dst, masklen);
3924       bsfq(tmp, tmp);
3925       cmov(Assembler::notZero, dst, tmp);
3926       break;
3927     default: assert(false, "Unhandled mask operation");
3928   }
3929 }
3930 #endif
3931 
3932 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
3933                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
3934                                         int vlen_enc) {
3935   assert(VM_Version::supports_avx512bw(), "");
3936   // Byte shuffles are inlane operations and indices are determined using
3937   // lower 4 bit of each shuffle lane, thus all shuffle indices are
3938   // normalized to index range 0-15. This makes sure that all the multiples
3939   // of an index value are placed at same relative position in 128 bit
3940   // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
3941   // will be 16th element in their respective 128 bit lanes.
3942   movl(rtmp, 16);
3943   evpbroadcastb(xtmp1, rtmp, vlen_enc);
3944 
3945   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
3946   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
3947   // original shuffle indices and move the shuffled lanes corresponding to true
3948   // mask to destination vector.
3949   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
3950   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
3951   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
3952 
3953   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
3954   // and broadcasting second 128 bit lane.
3955   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3956   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
3957   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
3958   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
3959   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3960 
3961   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
3962   // and broadcasting third 128 bit lane.
3963   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
3964   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
3965   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
3966   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
3967   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3968 
3969   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
3970   // and broadcasting third 128 bit lane.
3971   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3972   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
3973   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
3974   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
3975   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3976 }