1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_CodeStubs.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/opcodes.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
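// Maps a vector length in bytes to the AVX/EVEX length encoding used when
// emitting vector instructions. Sub-XMM lengths share the 128-bit encoding,
// e.g. vector_length_encoding(8) == vector_length_encoding(16) == Assembler::AVX_128bit.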
  39 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  40   switch (vlen_in_bytes) {
  41     case  4: // fall-through
  42     case  8: // fall-through
  43     case 16: return Assembler::AVX_128bit;
  44     case 32: return Assembler::AVX_256bit;
  45     case 64: return Assembler::AVX_512bit;
  46 
  47     default: {
  48       ShouldNotReachHere();
  49       return Assembler::AVX_NoVec;
  50     }
  51   }
  52 }
  53 
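// Builds a k-register mask with the low 'src' bits set, i.e. (1 << src) - 1,
// for post-loop multiversioned vector tails, and leaves the original element
// count (src) in dst.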
  54 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  55   guarantee(PostLoopMultiversioning, "must be");
  56   Assembler::movl(dst, 1);
  57   Assembler::shlxl(dst, dst, src);
  58   Assembler::decl(dst);
  59   Assembler::kmovdl(mask, dst);
  60   Assembler::movl(dst, src);
  61 }
  62 
  63 void C2_MacroAssembler::restorevectmask(KRegister mask) {
  64   guarantee(PostLoopMultiversioning, "must be");
  65   Assembler::knotwl(mask, k0);
  66 }
  67 
  68 #if INCLUDE_RTM_OPT
  69 
  70 // Update rtm_counters based on abort status
  71 // input: abort_status
  72 //        rtm_counters (RTMLockingCounters*)
  73 // flags are killed
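// Sketch of the update performed (illustrative; field names are not the
// actual RTMLockingCounters API):
//   counters->abort_count++;
//   if (PrintPreciseRTMLockingStatistics)
//     for (int i = 0; i < ABORT_STATUS_LIMIT; i++)
//       if (abort_status & (1 << i))  counters->abortX_count[i]++;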
  74 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  75 
  76   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  77   if (PrintPreciseRTMLockingStatistics) {
  78     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  79       Label check_abort;
  80       testl(abort_status, (1<<i));
  81       jccb(Assembler::equal, check_abort);
  82       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  83       bind(check_abort);
  84     }
  85   }
  86 }
  87 
// Branch if ((random & (count-1)) != 0); count must be a power of two (2^n)
  89 // tmp, scr and flags are killed
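// In effect the caller's increment is sampled: the low bits of the TSC act as a
// cheap pseudo-random value, so the branch is taken (skipping the increment)
// roughly (count-1)/count of the time. Illustrative sketch of the emitted test:
//   if ((rdtsc_low_32_bits & (count - 1)) != 0)  goto brLabel;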
  90 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  91   assert(tmp == rax, "");
  92   assert(scr == rdx, "");
  93   rdtsc(); // modifies EDX:EAX
  94   andptr(tmp, count-1);
  95   jccb(Assembler::notZero, brLabel);
  96 }
  97 
  98 // Perform abort ratio calculation, set no_rtm bit if high ratio
  99 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 100 // tmpReg, rtm_counters_Reg and flags are killed
 101 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 102                                                     Register rtm_counters_Reg,
 103                                                     RTMLockingCounters* rtm_counters,
 104                                                     Metadata* method_data) {
 105   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 106 
 107   if (RTMLockingCalculationDelay > 0) {
 108     // Delay calculation
 109     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 110     testptr(tmpReg, tmpReg);
 111     jccb(Assembler::equal, L_done);
 112   }
 113   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 114   //   Aborted transactions = abort_count * 100
 115   //   All transactions = total_count *  RTMTotalCountIncrRate
 116   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
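  // Equivalent C-style sketch of the decision below (field and accessor names
  // are illustrative, not the actual RTMLockingCounters/MethodData API):
  //   if (counters->abort_count >= RTMAbortThreshold &&
  //       counters->abort_count * 100 >= counters->total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
  //     mdo->rtm_state |= NoRTM;     // abort ratio too high: disable RTM for this method
  //   } else if (counters->total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
  //     mdo->rtm_state |= UseRTM;    // enough clean transactions: always use RTM
  //   }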
 117 
 118   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 119   cmpptr(tmpReg, RTMAbortThreshold);
 120   jccb(Assembler::below, L_check_always_rtm2);
 121   imulptr(tmpReg, tmpReg, 100);
 122 
 123   Register scrReg = rtm_counters_Reg;
 124   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 125   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 126   imulptr(scrReg, scrReg, RTMAbortRatio);
 127   cmpptr(tmpReg, scrReg);
 128   jccb(Assembler::below, L_check_always_rtm1);
 129   if (method_data != NULL) {
 130     // set rtm_state to "no rtm" in MDO
 131     mov_metadata(tmpReg, method_data);
 132     lock();
 133     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 134   }
 135   jmpb(L_done);
 136   bind(L_check_always_rtm1);
 137   // Reload RTMLockingCounters* address
 138   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 139   bind(L_check_always_rtm2);
 140   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 141   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 142   jccb(Assembler::below, L_done);
 143   if (method_data != NULL) {
 144     // set rtm_state to "always rtm" in MDO
 145     mov_metadata(tmpReg, method_data);
 146     lock();
 147     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 148   }
 149   bind(L_done);
 150 }
 151 
 152 // Update counters and perform abort ratio calculation
 153 // input:  abort_status_Reg
 154 // rtm_counters_Reg, flags are killed
 155 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 156                                       Register rtm_counters_Reg,
 157                                       RTMLockingCounters* rtm_counters,
 158                                       Metadata* method_data,
 159                                       bool profile_rtm) {
 160 
 161   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 162   // update rtm counters based on rax value at abort
 163   // reads abort_status_Reg, updates flags
 164   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 165   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 166   if (profile_rtm) {
 167     // Save abort status because abort_status_Reg is used by following code.
 168     if (RTMRetryCount > 0) {
 169       push(abort_status_Reg);
 170     }
 171     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 172     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 173     // restore abort status
 174     if (RTMRetryCount > 0) {
 175       pop(abort_status_Reg);
 176     }
 177   }
 178 }
 179 
 180 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 181 // inputs: retry_count_Reg
 182 //       : abort_status_Reg
 183 // output: retry_count_Reg decremented by 1
 184 // flags are killed
 185 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 186   Label doneRetry;
 187   assert(abort_status_Reg == rax, "");
 188   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 189   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 190   // if reason is in 0x6 and retry count != 0 then retry
 191   andptr(abort_status_Reg, 0x6);
 192   jccb(Assembler::zero, doneRetry);
 193   testl(retry_count_Reg, retry_count_Reg);
 194   jccb(Assembler::zero, doneRetry);
 195   pause();
 196   decrementl(retry_count_Reg);
 197   jmp(retryLabel);
 198   bind(doneRetry);
 199 }
 200 
 201 // Spin and retry if lock is busy,
 202 // inputs: box_Reg (monitor address)
 203 //       : retry_count_Reg
 204 // output: retry_count_Reg decremented by 1
 205 //       : clear z flag if retry count exceeded
 206 // tmp_Reg, scr_Reg, flags are killed
 207 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 208                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 209   Label SpinLoop, SpinExit, doneRetry;
 210   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 211 
 212   testl(retry_count_Reg, retry_count_Reg);
 213   jccb(Assembler::zero, doneRetry);
 214   decrementl(retry_count_Reg);
 215   movptr(scr_Reg, RTMSpinLoopCount);
 216 
 217   bind(SpinLoop);
 218   pause();
 219   decrementl(scr_Reg);
 220   jccb(Assembler::lessEqual, SpinExit);
 221   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 222   testptr(tmp_Reg, tmp_Reg);
 223   jccb(Assembler::notZero, SpinLoop);
 224 
 225   bind(SpinExit);
 226   jmp(retryLabel);
 227   bind(doneRetry);
 228   incrementl(retry_count_Reg); // clear z flag
 229 }
 230 
 231 // Use RTM for normal stack locks
 232 // Input: objReg (object to lock)
 233 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 234                                          Register retry_on_abort_count_Reg,
 235                                          RTMLockingCounters* stack_rtm_counters,
 236                                          Metadata* method_data, bool profile_rtm,
 237                                          Label& DONE_LABEL, Label& IsInflated) {
 238   assert(UseRTMForStackLocks, "why call this otherwise?");
 239   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 240   assert(tmpReg == rax, "");
 241   assert(scrReg == rdx, "");
 242   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 243 
 244   if (RTMRetryCount > 0) {
 245     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 246     bind(L_rtm_retry);
 247   }
 248   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 249   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 250   jcc(Assembler::notZero, IsInflated);
 251 
 252   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 253     Label L_noincrement;
 254     if (RTMTotalCountIncrRate > 1) {
 255       // tmpReg, scrReg and flags are killed
 256       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 257     }
 258     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 259     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 260     bind(L_noincrement);
 261   }
 262   xbegin(L_on_abort);
 263   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 264   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 265   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 266   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 267 
 268   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 269   if (UseRTMXendForLockBusy) {
 270     xend();
 271     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 272     jmp(L_decrement_retry);
 273   }
 274   else {
 275     xabort(0);
 276   }
 277   bind(L_on_abort);
 278   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 279     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 280   }
 281   bind(L_decrement_retry);
 282   if (RTMRetryCount > 0) {
 283     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 284     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 285   }
 286 }
 287 
 288 // Use RTM for inflating locks
 289 // inputs: objReg (object to lock)
 290 //         boxReg (on-stack box address (displaced header location) - KILLED)
 291 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 292 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 293                                             Register scrReg, Register retry_on_busy_count_Reg,
 294                                             Register retry_on_abort_count_Reg,
 295                                             RTMLockingCounters* rtm_counters,
 296                                             Metadata* method_data, bool profile_rtm,
 297                                             Label& DONE_LABEL) {
 298   assert(UseRTMLocking, "why call this otherwise?");
 299   assert(tmpReg == rax, "");
 300   assert(scrReg == rdx, "");
 301   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 302   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 303 
 304   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 305   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 306   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 307 
 308   if (RTMRetryCount > 0) {
 309     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 310     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 311     bind(L_rtm_retry);
 312   }
 313   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 314     Label L_noincrement;
 315     if (RTMTotalCountIncrRate > 1) {
 316       // tmpReg, scrReg and flags are killed
 317       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 318     }
 319     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 320     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 321     bind(L_noincrement);
 322   }
 323   xbegin(L_on_abort);
 324   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 325   movptr(tmpReg, Address(tmpReg, owner_offset));
 326   testptr(tmpReg, tmpReg);
 327   jcc(Assembler::zero, DONE_LABEL);
 328   if (UseRTMXendForLockBusy) {
 329     xend();
 330     jmp(L_decrement_retry);
 331   }
 332   else {
 333     xabort(0);
 334   }
 335   bind(L_on_abort);
 336   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 337   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 338     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 339   }
 340   if (RTMRetryCount > 0) {
 341     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 342     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 343   }
 344 
 345   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 346   testptr(tmpReg, tmpReg) ;
 347   jccb(Assembler::notZero, L_decrement_retry) ;
 348 
 349   // Appears unlocked - try to swing _owner from null to non-null.
 350   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 351 #ifdef _LP64
 352   Register threadReg = r15_thread;
 353 #else
 354   get_thread(scrReg);
 355   Register threadReg = scrReg;
 356 #endif
 357   lock();
 358   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 359 
 360   if (RTMRetryCount > 0) {
 361     // success done else retry
 362     jccb(Assembler::equal, DONE_LABEL) ;
 363     bind(L_decrement_retry);
 364     // Spin and retry if lock is busy.
 365     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 366   }
 367   else {
 368     bind(L_decrement_retry);
 369   }
 370 }
 371 
 372 #endif //  INCLUDE_RTM_OPT
 373 
 374 // fast_lock and fast_unlock used by C2
 375 
 376 // Because the transitions from emitted code to the runtime
 377 // monitorenter/exit helper stubs are so slow it's critical that
 378 // we inline both the stack-locking fast path and the inflated fast path.
 379 //
 380 // See also: cmpFastLock and cmpFastUnlock.
 381 //
 382 // What follows is a specialized inline transliteration of the code
 383 // in enter() and exit(). If we're concerned about I$ bloat another
 384 // option would be to emit TrySlowEnter and TrySlowExit methods
 385 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 387 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 388 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 389 // In practice, however, the # of lock sites is bounded and is usually small.
 390 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 394 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 403 //
 404 // TODO:
 405 //
 406 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 407 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 408 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 409 //    the lock operators would typically be faster than reifying Self.
 410 //
 411 // *  Ideally I'd define the primitives as:
 412 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 413 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 414 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 416 //    Furthermore the register assignments are overconstrained, possibly resulting in
 417 //    sub-optimal code near the synchronization site.
 418 //
 419 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 420 //    Alternately, use a better sp-proximity test.
 421 //
 422 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 423 //    Either one is sufficient to uniquely identify a thread.
 424 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 425 //
 426 // *  Intrinsify notify() and notifyAll() for the common cases where the
 427 //    object is locked by the calling thread but the waitlist is empty.
//    This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 429 //
 430 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 431 //    But beware of excessive branch density on AMD Opterons.
 432 //
 433 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 434 //    or failure of the fast path.  If the fast path fails then we pass
 435 //    control to the slow path, typically in C.  In fast_lock and
 436 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 437 //    will emit a conditional branch immediately after the node.
 438 //    So we have branches to branches and lots of ICC.ZF games.
 439 //    Instead, it might be better to have C2 pass a "FailureLabel"
 440 //    into fast_lock and fast_unlock.  In the case of success, control
 441 //    will drop through the node.  ICC.ZF is undefined at exit.
 442 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
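//
// For reference, the ZF contract in C-like terms (illustrative sketch of how
// C2's cmpFastLock/cmpFastUnlock nodes consume the result, not literal code):
//   fast_lock(obj, box, ...);          // leaves ZF = 1 on success, ZF = 0 on failure
//   if (!ZF)  call slow_path_enter;    // branch emitted by C2 right after the node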
 444 
 445 
 446 // obj: object to lock
 447 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 449 // scr: tmp -- KILLED
 450 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 451                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 452                                  BiasedLockingCounters* counters,
 453                                  RTMLockingCounters* rtm_counters,
 454                                  RTMLockingCounters* stack_rtm_counters,
 455                                  Metadata* method_data,
 456                                  bool use_rtm, bool profile_rtm) {
 457   // Ensure the register assignments are disjoint
 458   assert(tmpReg == rax, "");
 459 
 460   if (use_rtm) {
 461     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 462   } else {
 463     assert(cx2Reg == noreg, "");
 464     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 465   }
 466 
 467   if (counters != NULL) {
 468     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 469   }
 470 
 471   // Possible cases that we'll encounter in fast_lock
 472   // ------------------------------------------------
 473   // * Inflated
 474   //    -- unlocked
 475   //    -- Locked
 476   //       = by self
 477   //       = by other
 478   // * biased
 479   //    -- by Self
 480   //    -- by other
 481   // * neutral
 482   // * stack-locked
 483   //    -- by self
 484   //       = sp-proximity test hits
 485   //       = sp-proximity test generates false-negative
 486   //    -- by other
 487   //
 488 
 489   Label IsInflated, DONE_LABEL;
 490 
 491   if (DiagnoseSyncOnValueBasedClasses != 0) {
 492     load_klass(tmpReg, objReg, cx1Reg);
 493     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 494     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 495     jcc(Assembler::notZero, DONE_LABEL);
 496   }
 497 
 498   // it's stack-locked, biased or neutral
 499   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 500   // order to reduce the number of conditional branches in the most common cases.
 501   // Beware -- there's a subtle invariant that fetch of the markword
 502   // at [FETCH], below, will never observe a biased encoding (*101b).
 503   // If this invariant is not held we risk exclusion (safety) failure.
 504   if (UseBiasedLocking && !UseOptoBiasInlining) {
 505     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 506   }
 507 
 508 #if INCLUDE_RTM_OPT
 509   if (UseRTMForStackLocks && use_rtm) {
 510     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 511                       stack_rtm_counters, method_data, profile_rtm,
 512                       DONE_LABEL, IsInflated);
 513   }
 514 #endif // INCLUDE_RTM_OPT
 515 
 516   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 517   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 518   jcc(Assembler::notZero, IsInflated);
 519 
 520   if (LockingMode == LM_MONITOR) {
 521     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 522     testptr(objReg, objReg);
 523   } else if (LockingMode == LM_LEGACY) {
 524     // Attempt stack-locking ...
 525     orptr (tmpReg, markWord::unlocked_value);
 526     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 527     lock();
 528     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 529     if (counters != NULL) {
 530       cond_inc32(Assembler::equal,
 531                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 532     }
 533     jcc(Assembler::equal, DONE_LABEL);           // Success
 534 
 535     // Recursive locking.
 536     // The object is stack-locked: markword contains stack pointer to BasicLock.
 537     // Locked by current thread if difference with current SP is less than one page.
 538     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 540     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
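    // Illustrative arithmetic (assuming 4K pages on LP64): the mask is
    // 7 - 4096 = 0xFFFF...F007, so ZF == 1 iff 0 <= (mark - rsp) < 4096 and the
    // low three bits of the difference are clear, i.e. the mark is a BasicLock
    // address in the current thread's stack within one page of rsp.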
 541     movptr(Address(boxReg, 0), tmpReg);
 542     if (counters != NULL) {
 543       cond_inc32(Assembler::equal,
 544                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 545     }
 546   } else {
 547     assert(LockingMode == LM_LIGHTWEIGHT, "");
 548     fast_lock_impl(objReg, tmpReg, thread, scrReg, DONE_LABEL);
 549     xorl(tmpReg, tmpReg); // Set ZF=1 to indicate success
 550   }
 551   jmp(DONE_LABEL);
 552 
 553   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor address + markWord::monitor_value
 555 
 556 #if INCLUDE_RTM_OPT
 557   // Use the same RTM locking code in 32- and 64-bit VM.
 558   if (use_rtm) {
 559     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 560                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 561   } else {
 562 #endif // INCLUDE_RTM_OPT
 563 
 564 #ifndef _LP64
 565   // The object is inflated.
 566 
 567   // boxReg refers to the on-stack BasicLock in the current frame.
 568   // We'd like to write:
 569   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 571   // additional latency as we have another ST in the store buffer that must drain.
 572 
 573   // avoid ST-before-CAS
 574   // register juggle because we need tmpReg for cmpxchgptr below
 575   movptr(scrReg, boxReg);
 576   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 577 
 578   // Optimistic form: consider XORL tmpReg,tmpReg
 579   movptr(tmpReg, NULL_WORD);
 580 
 581   // Appears unlocked - try to swing _owner from null to non-null.
 582   // Ideally, I'd manifest "Self" with get_thread and then attempt
 583   // to CAS the register containing Self into m->Owner.
 584   // But we don't have enough registers, so instead we can either try to CAS
 585   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 586   // we later store "Self" into m->Owner.  Transiently storing a stack address
 587   // (rsp or the address of the box) into  m->owner is harmless.
 588   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 589   lock();
 590   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 591   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 592   // If we weren't able to swing _owner from NULL to the BasicLock
 593   // then take the slow path.
 594   jccb  (Assembler::notZero, DONE_LABEL);
 595   // update _owner from BasicLock to thread
 596   get_thread (scrReg);                    // beware: clobbers ICCs
 597   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 598   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 599 
 600   // If the CAS fails we can either retry or pass control to the slow path.
 601   // We use the latter tactic.
 602   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 603   // If the CAS was successful ...
 604   //   Self has acquired the lock
 605   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 606   // Intentional fall-through into DONE_LABEL ...
 607 #else // _LP64
 608   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 609   movq(scrReg, tmpReg);
 610   xorq(tmpReg, tmpReg);
 611   lock();
 612   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 613   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 614   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 615   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 616   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 617   jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)
 618 
 619   cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
 620   jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
 621   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 622   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 623 #endif // _LP64
 624 #if INCLUDE_RTM_OPT
 625   } // use_rtm()
 626 #endif
 627   // DONE_LABEL is a hot target - we'd really like to place it at the
 628   // start of cache line by padding with NOPs.
 629   // See the AMD and Intel software optimization manuals for the
 630   // most efficient "long" NOP encodings.
 631   // Unfortunately none of our alignment mechanisms suffice.
 632   bind(DONE_LABEL);
 633 
 634   // At DONE_LABEL the icc ZFlag is set as follows ...
 635   // fast_unlock uses the same protocol.
 636   // ZFlag == 1 -> Success
 637   // ZFlag == 0 -> Failure - force control through the slow path
 638 }
 639 
 640 // obj: object to unlock
 641 // box: box address (displaced header location), killed.  Must be EAX.
 642 // tmp: killed, cannot be obj nor box.
 643 //
 644 // Some commentary on balanced locking:
 645 //
 646 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 647 // Methods that don't have provably balanced locking are forced to run in the
 648 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 649 // The interpreter provides two properties:
 650 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 652 //      interpreter maintains an on-stack list of locks currently held by
 653 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 656 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 658 // B() doesn't have provably balanced locking so it runs in the interpreter.
 659 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 660 // is still locked by A().
 661 //
 662 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 663 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 664 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 665 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
 667 // could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
 669 // A perfectly viable alternative is to elide the owner check except when
 670 // Xcheck:jni is enabled.
 671 
 672 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 673   assert(boxReg == rax, "");
 674   assert_different_registers(objReg, boxReg, tmpReg);
 675 
 676   Label DONE_LABEL, Stacked, CheckSucc;
 677 
 678   // Critically, the biased locking test must have precedence over
 679   // and appear before the (box->dhw == 0) recursive stack-lock test.
 680   if (UseBiasedLocking && !UseOptoBiasInlining) {
 681     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 682   }
 683 
 684 #if INCLUDE_RTM_OPT
 685   if (UseRTMForStackLocks && use_rtm) {
 686     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 687     Label L_regular_unlock;
 688     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 689     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 690     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 691     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 692     xend();                                                           // otherwise end...
 693     jmp(DONE_LABEL);                                                  // ... and we're done
 694     bind(L_regular_unlock);
 695   }
 696 #endif
 697 
 698   if (LockingMode == LM_LEGACY) {
 699     cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 700     jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 701   }
 702   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 703   if (LockingMode != LM_MONITOR) {
 704     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 705     jcc(Assembler::zero, Stacked);
 706   }
 707 
 708   // It's inflated.
 709   if (LockingMode == LM_LIGHTWEIGHT) {
 710     // If the owner is ANONYMOUS, we need to fix it.
 711     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 712 #ifdef _LP64
 713     if (!Compile::current()->output()->in_scratch_emit_size()) {
 714       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 715       Compile::current()->output()->add_stub(stub);
 716       jcc(Assembler::notEqual, stub->entry());
 717       bind(stub->continuation());
 718     } else
 719 #endif
 720     {
 721       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 722       // Call the slow-path instead.
 723       jcc(Assembler::notEqual, DONE_LABEL);
 724     }
 725   }
 726 
 727 #if INCLUDE_RTM_OPT
 728   if (use_rtm) {
 729     Label L_regular_inflated_unlock;
 730     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 731     movptr(boxReg, Address(tmpReg, owner_offset));
 732     testptr(boxReg, boxReg);
 733     jccb(Assembler::notZero, L_regular_inflated_unlock);
 734     xend();
 735     jmp(DONE_LABEL);
 736     bind(L_regular_inflated_unlock);
 737   }
 738 #endif
 739 
 740   // Despite our balanced locking property we still check that m->_owner == Self
 741   // as java routines or native JNI code called by this thread might
 742   // have released the lock.
 743   // Refer to the comments in synchronizer.cpp for how we might encode extra
 744   // state in _succ so we can avoid fetching EntryList|cxq.
 745   //
 746   // If there's no contention try a 1-0 exit.  That is, exit without
 747   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 748   // we detect and recover from the race that the 1-0 exit admits.
 749   //
 750   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 751   // before it STs null into _owner, releasing the lock.  Updates
 752   // to data protected by the critical section must be visible before
 753   // we drop the lock (and thus before any other thread could acquire
 754   // the lock and observe the fields protected by the lock).
 755   // IA32's memory-model is SPO, so STs are ordered with respect to
 756   // each other and there's no need for an explicit barrier (fence).
 757   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
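  //
  // Rough C-like sketch of the 1-0 exit and race detection below (illustrative,
  // not literal; see the LP64 path for the actual instruction ordering):
  //   m->_owner = NULL;                        // ST: release the lock
  //   fence();                                 // ST _owner vs. LD _succ (Dekker duality)
  //   if (m->_succ == NULL && (m->_cxq != NULL || m->_EntryList != NULL))
  //     ...                                    // no apparent successor: reacquire or go slow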
 758 #ifndef _LP64
 759   get_thread (boxReg);
 760 
 761   // Note that we could employ various encoding schemes to reduce
 762   // the number of loads below (currently 4) to just 2 or 3.
 763   // Refer to the comments in synchronizer.cpp.
 764   // In practice the chain of fetches doesn't seem to impact performance, however.
 765   xorptr(boxReg, boxReg);
 766   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 767   jccb  (Assembler::notZero, DONE_LABEL);
 768   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 769   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 770   jccb  (Assembler::notZero, DONE_LABEL);
 771   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 772   jmpb  (DONE_LABEL);
 773 
  // Intentional fall-through into DONE_LABEL
 775 
 776   // DONE_LABEL is a hot target - we'd really like to place it at the
 777   // start of cache line by padding with NOPs.
 778   // See the AMD and Intel software optimization manuals for the
 779   // most efficient "long" NOP encodings.
 780   // Unfortunately none of our alignment mechanisms suffice.
 781   bind (CheckSucc);
 782 #else // _LP64
 783   // It's inflated
 784   Label LNotRecursive, LSuccess, LGoSlowPath;
 785 
 786   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 787   jccb(Assembler::equal, LNotRecursive);
 788 
 789   // Recursive inflated unlock
 790   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 791   jmpb(LSuccess);
 792 
 793   bind(LNotRecursive);
 794   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 795   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 796   jccb  (Assembler::notZero, CheckSucc);
 797   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 798   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 799   jmpb  (DONE_LABEL);
 800 
 801   // Try to avoid passing control into the slow_path ...
 802   bind  (CheckSucc);
 803 
 804   // The following optional optimization can be elided if necessary
 805   // Effectively: if (succ == null) goto slow path
 806   // The code reduces the window for a race, however,
 807   // and thus benefits performance.
 808   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 809   jccb  (Assembler::zero, LGoSlowPath);
 810 
 811   xorptr(boxReg, boxReg);
 812   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 813   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 814 
 815   // Memory barrier/fence
 816   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 817   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 818   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 819   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 820   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 821   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 822   lock(); addl(Address(rsp, 0), 0);
 823 
 824   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 825   jccb  (Assembler::notZero, LSuccess);
 826 
 827   // Rare inopportune interleaving - race.
 828   // The successor vanished in the small window above.
 829   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 830   // We need to ensure progress and succession.
 831   // Try to reacquire the lock.
 832   // If that fails then the new owner is responsible for succession and this
 833   // thread needs to take no further action and can exit via the fast path (success).
 834   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 838 
 839   // box is really RAX -- the following CMPXCHG depends on that binding
 840   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 841   lock();
 842   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 843   // There's no successor so we tried to regrab the lock.
 844   // If that didn't work, then another thread grabbed the
 845   // lock so we're done (and exit was a success).
 846   jccb  (Assembler::notEqual, LSuccess);
 847   // Intentional fall-through into slow path
 848 
 849   bind  (LGoSlowPath);
 850   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 851   jmpb  (DONE_LABEL);
 852 
 853   bind  (LSuccess);
 854   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 855   jmpb  (DONE_LABEL);
 856 
 857 #endif
 858   if (LockingMode != LM_MONITOR) {
 859     bind  (Stacked);
 860     if (LockingMode == LM_LIGHTWEIGHT) {
 861       mov(boxReg, tmpReg);
 862       fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
 863       xorl(tmpReg, tmpReg);
 864     } else if (LockingMode == LM_LEGACY) {
 865       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 866       lock();
 867       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 868     }
 869   }
 870   bind(DONE_LABEL);
 871 }
 872 
 873 //-------------------------------------------------------------------------------------------
// Generic instruction support used by C2 code generation in the .ad files
 875 
 876 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 877   if (dst != src) {
 878     movdqu(dst, src);
 879   }
 880   if (opcode == Op_AbsVD) {
 881     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 882   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 884     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 885   }
 886 }
 887 
 888 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 889   if (opcode == Op_AbsVD) {
 890     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 891   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 893     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 894   }
 895 }
 896 
 897 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 898   if (dst != src) {
 899     movdqu(dst, src);
 900   }
 901   if (opcode == Op_AbsVF) {
 902     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 903   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 905     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 906   }
 907 }
 908 
 909 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 910   if (opcode == Op_AbsVF) {
 911     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 912   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 914     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 915   }
 916 }
 917 
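// For T_LONG there is no pminsq/pmaxsq below AVX-512, so the SSE4.2 idiom
// pcmpgtq + blendvpd is used instead. Element-wise (illustrative):
//   Op_MinV: dst[i] = (dst[i] > src[i]) ? src[i] : dst[i];
//   Op_MaxV: dst[i] = (src[i] > dst[i]) ? src[i] : dst[i];
// with xmm0 holding the comparison mask consumed implicitly by blendvpd.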
 918 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 919   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 920   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 921 
 922   if (opcode == Op_MinV) {
 923     if (elem_bt == T_BYTE) {
 924       pminsb(dst, src);
 925     } else if (elem_bt == T_SHORT) {
 926       pminsw(dst, src);
 927     } else if (elem_bt == T_INT) {
 928       pminsd(dst, src);
 929     } else {
 930       assert(elem_bt == T_LONG, "required");
 931       assert(tmp == xmm0, "required");
 932       assert_different_registers(dst, src, tmp);
 933       movdqu(xmm0, dst);
 934       pcmpgtq(xmm0, src);
 935       blendvpd(dst, src);  // xmm0 as mask
 936     }
 937   } else { // opcode == Op_MaxV
 938     if (elem_bt == T_BYTE) {
 939       pmaxsb(dst, src);
 940     } else if (elem_bt == T_SHORT) {
 941       pmaxsw(dst, src);
 942     } else if (elem_bt == T_INT) {
 943       pmaxsd(dst, src);
 944     } else {
 945       assert(elem_bt == T_LONG, "required");
 946       assert(tmp == xmm0, "required");
 947       assert_different_registers(dst, src, tmp);
 948       movdqu(xmm0, src);
 949       pcmpgtq(xmm0, dst);
 950       blendvpd(dst, src);  // xmm0 as mask
 951     }
 952   }
 953 }
 954 
 955 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 956                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 957                                  int vlen_enc) {
 958   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 959 
 960   if (opcode == Op_MinV) {
 961     if (elem_bt == T_BYTE) {
 962       vpminsb(dst, src1, src2, vlen_enc);
 963     } else if (elem_bt == T_SHORT) {
 964       vpminsw(dst, src1, src2, vlen_enc);
 965     } else if (elem_bt == T_INT) {
 966       vpminsd(dst, src1, src2, vlen_enc);
 967     } else {
 968       assert(elem_bt == T_LONG, "required");
 969       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 970         vpminsq(dst, src1, src2, vlen_enc);
 971       } else {
 972         assert_different_registers(dst, src1, src2);
 973         vpcmpgtq(dst, src1, src2, vlen_enc);
 974         vblendvpd(dst, src1, src2, dst, vlen_enc);
 975       }
 976     }
 977   } else { // opcode == Op_MaxV
 978     if (elem_bt == T_BYTE) {
 979       vpmaxsb(dst, src1, src2, vlen_enc);
 980     } else if (elem_bt == T_SHORT) {
 981       vpmaxsw(dst, src1, src2, vlen_enc);
 982     } else if (elem_bt == T_INT) {
 983       vpmaxsd(dst, src1, src2, vlen_enc);
 984     } else {
 985       assert(elem_bt == T_LONG, "required");
 986       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 987         vpmaxsq(dst, src1, src2, vlen_enc);
 988       } else {
 989         assert_different_registers(dst, src1, src2);
 990         vpcmpgtq(dst, src1, src2, vlen_enc);
 991         vblendvpd(dst, src2, src1, dst, vlen_enc);
 992       }
 993     }
 994   }
 995 }
 996 
 997 // Float/Double min max
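// These helpers are intended to follow Java Math.min/max semantics for
// float/double vectors: if either input is NaN the result is NaN, and
// min(-0.0, +0.0) is -0.0 (resp. max(+0.0, -0.0) is +0.0). The blend steps
// below order the inputs by sign so the plain vminps/vmaxps pick the correct
// zero, and the final unordered-compare blend forwards NaN inputs.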
 998 
 999 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1000                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1001                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1002                                    int vlen_enc) {
1003   assert(UseAVX > 0, "required");
1004   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1005          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1006   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1007   assert_different_registers(a, b, tmp, atmp, btmp);
1008 
1009   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1010   bool is_double_word = is_double_word_type(elem_bt);
1011 
1012   if (!is_double_word && is_min) {
1013     vblendvps(atmp, a, b, a, vlen_enc);
1014     vblendvps(btmp, b, a, a, vlen_enc);
1015     vminps(tmp, atmp, btmp, vlen_enc);
1016     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1017     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1018   } else if (!is_double_word && !is_min) {
1019     vblendvps(btmp, b, a, b, vlen_enc);
1020     vblendvps(atmp, a, b, b, vlen_enc);
1021     vmaxps(tmp, atmp, btmp, vlen_enc);
1022     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1023     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1024   } else if (is_double_word && is_min) {
1025     vblendvpd(atmp, a, b, a, vlen_enc);
1026     vblendvpd(btmp, b, a, a, vlen_enc);
1027     vminpd(tmp, atmp, btmp, vlen_enc);
1028     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1030   } else {
1031     assert(is_double_word && !is_min, "sanity");
1032     vblendvpd(btmp, b, a, b, vlen_enc);
1033     vblendvpd(atmp, a, b, b, vlen_enc);
1034     vmaxpd(tmp, atmp, btmp, vlen_enc);
1035     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1036     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1037   }
1038 }
1039 
1040 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1041                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1042                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1043                                     int vlen_enc) {
1044   assert(UseAVX > 2, "required");
1045   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1046          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1047   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1048   assert_different_registers(dst, a, b, atmp, btmp);
1049 
1050   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1051   bool is_double_word = is_double_word_type(elem_bt);
1052   bool merge = true;
1053 
1054   if (!is_double_word && is_min) {
1055     evpmovd2m(ktmp, a, vlen_enc);
1056     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1057     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1058     vminps(dst, atmp, btmp, vlen_enc);
1059     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1060     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1061   } else if (!is_double_word && !is_min) {
1062     evpmovd2m(ktmp, b, vlen_enc);
1063     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1064     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1065     vmaxps(dst, atmp, btmp, vlen_enc);
1066     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1067     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1068   } else if (is_double_word && is_min) {
1069     evpmovq2m(ktmp, a, vlen_enc);
1070     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1071     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1072     vminpd(dst, atmp, btmp, vlen_enc);
1073     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1074     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1075   } else {
1076     assert(is_double_word && !is_min, "sanity");
1077     evpmovq2m(ktmp, b, vlen_enc);
1078     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1079     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1080     vmaxpd(dst, atmp, btmp, vlen_enc);
1081     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1082     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1083   }
1084 }
1085 
1086 // Float/Double signum
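// signum(x) returns x itself for +0.0, -0.0 and NaN, +1.0 for x > 0 and -1.0
// for x < 0. Sketch of the scalar logic emitted below (dst initially holds x):
//   if (x == 0.0 || isnan(x)) return x;
//   return (x > 0.0) ? 1.0 : -1.0;   // -1.0 produced by flipping the sign of +1.0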
1087 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1088                                   XMMRegister zero, XMMRegister one,
1089                                   Register scratch) {
1090   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1091 
1092   Label DONE_LABEL;
1093 
1094   if (opcode == Op_SignumF) {
1095     assert(UseSSE > 0, "required");
1096     ucomiss(dst, zero);
1097     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1098     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1099     movflt(dst, one);
1100     jcc(Assembler::above, DONE_LABEL);
1101     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1102   } else if (opcode == Op_SignumD) {
1103     assert(UseSSE > 1, "required");
1104     ucomisd(dst, zero);
1105     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1106     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1107     movdbl(dst, one);
1108     jcc(Assembler::above, DONE_LABEL);
1109     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1110   }
1111 
1112   bind(DONE_LABEL);
1113 }
1114 
1115 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1116   if (sign) {
1117     pmovsxbw(dst, src);
1118   } else {
1119     pmovzxbw(dst, src);
1120   }
1121 }
1122 
1123 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1124   if (sign) {
1125     vpmovsxbw(dst, src, vector_len);
1126   } else {
1127     vpmovzxbw(dst, src, vector_len);
1128   }
1129 }
1130 
1131 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1132   if (sign) {
1133     vpmovsxbd(dst, src, vector_len);
1134   } else {
1135     vpmovzxbd(dst, src, vector_len);
1136   }
1137 }
1138 
1139 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1140   if (sign) {
1141     vpmovsxwd(dst, src, vector_len);
1142   } else {
1143     vpmovzxwd(dst, src, vector_len);
1144   }
1145 }
1146 
1147 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1148                                      int shift, int vector_len) {
1149   if (opcode == Op_RotateLeftV) {
1150     if (etype == T_INT) {
1151       evprold(dst, src, shift, vector_len);
1152     } else {
1153       assert(etype == T_LONG, "expected type T_LONG");
1154       evprolq(dst, src, shift, vector_len);
1155     }
1156   } else {
1157     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1158     if (etype == T_INT) {
1159       evprord(dst, src, shift, vector_len);
1160     } else {
1161       assert(etype == T_LONG, "expected type T_LONG");
1162       evprorq(dst, src, shift, vector_len);
1163     }
1164   }
1165 }
1166 
1167 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1168                                      XMMRegister shift, int vector_len) {
1169   if (opcode == Op_RotateLeftV) {
1170     if (etype == T_INT) {
1171       evprolvd(dst, src, shift, vector_len);
1172     } else {
1173       assert(etype == T_LONG, "expected type T_LONG");
1174       evprolvq(dst, src, shift, vector_len);
1175     }
1176   } else {
1177     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1178     if (etype == T_INT) {
1179       evprorvd(dst, src, shift, vector_len);
1180     } else {
1181       assert(etype == T_LONG, "expected type T_LONG");
1182       evprorvq(dst, src, shift, vector_len);
1183     }
1184   }
1185 }
1186 
1187 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1188   if (opcode == Op_RShiftVI) {
1189     psrad(dst, shift);
1190   } else if (opcode == Op_LShiftVI) {
1191     pslld(dst, shift);
1192   } else {
1193     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1194     psrld(dst, shift);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1199   switch (opcode) {
1200     case Op_RShiftVI:  psrad(dst, shift); break;
1201     case Op_LShiftVI:  pslld(dst, shift); break;
1202     case Op_URShiftVI: psrld(dst, shift); break;
1203 
1204     default: assert(false, "%s", NodeClassNames[opcode]);
1205   }
1206 }
1207 
1208 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1209   if (opcode == Op_RShiftVI) {
1210     vpsrad(dst, nds, shift, vector_len);
1211   } else if (opcode == Op_LShiftVI) {
1212     vpslld(dst, nds, shift, vector_len);
1213   } else {
1214     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1215     vpsrld(dst, nds, shift, vector_len);
1216   }
1217 }
1218 
1219 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1220   switch (opcode) {
1221     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1222     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1223     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1224 
1225     default: assert(false, "%s", NodeClassNames[opcode]);
1226   }
1227 }
1228 
1229 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1230   switch (opcode) {
1231     case Op_RShiftVB:  // fall-through
1232     case Op_RShiftVS:  psraw(dst, shift); break;
1233 
1234     case Op_LShiftVB:  // fall-through
1235     case Op_LShiftVS:  psllw(dst, shift);   break;
1236 
1237     case Op_URShiftVS: // fall-through
1238     case Op_URShiftVB: psrlw(dst, shift);  break;
1239 
1240     default: assert(false, "%s", NodeClassNames[opcode]);
1241   }
1242 }
1243 
1244 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1245   switch (opcode) {
1246     case Op_RShiftVB:  // fall-through
1247     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1248 
1249     case Op_LShiftVB:  // fall-through
1250     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1251 
1252     case Op_URShiftVS: // fall-through
1253     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1254 
1255     default: assert(false, "%s", NodeClassNames[opcode]);
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1260   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1262     case Op_LShiftVL:  psllq(dst, shift); break;
1263     case Op_URShiftVL: psrlq(dst, shift); break;
1264 
1265     default: assert(false, "%s", NodeClassNames[opcode]);
1266   }
1267 }
1268 
1269 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1270   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1272   } else if (opcode == Op_LShiftVL) {
1273     psllq(dst, shift);
1274   } else {
1275     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1276     psrlq(dst, shift);
1277   }
1278 }
1279 
1280 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1281   switch (opcode) {
1282     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1283     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1284     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1285 
1286     default: assert(false, "%s", NodeClassNames[opcode]);
1287   }
1288 }
1289 
1290 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1291   if (opcode == Op_RShiftVL) {
1292     evpsraq(dst, nds, shift, vector_len);
1293   } else if (opcode == Op_LShiftVL) {
1294     vpsllq(dst, nds, shift, vector_len);
1295   } else {
1296     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1297     vpsrlq(dst, nds, shift, vector_len);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302   switch (opcode) {
1303     case Op_RShiftVB:  // fall-through
1304     case Op_RShiftVS:  // fall-through
1305     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1306 
1307     case Op_LShiftVB:  // fall-through
1308     case Op_LShiftVS:  // fall-through
1309     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1310 
1311     case Op_URShiftVB: // fall-through
1312     case Op_URShiftVS: // fall-through
1313     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1314 
1315     default: assert(false, "%s", NodeClassNames[opcode]);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1320   switch (opcode) {
1321     case Op_RShiftVB:  // fall-through
1322     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1323 
1324     case Op_LShiftVB:  // fall-through
1325     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1326 
1327     case Op_URShiftVB: // fall-through
1328     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1329 
1330     default: assert(false, "%s", NodeClassNames[opcode]);
1331   }
1332 }
1333 
1334 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1335   assert(UseAVX >= 2, "required");
1336   switch (opcode) {
1337     case Op_RShiftVL: {
1338       if (UseAVX > 2) {
1339         assert(tmp == xnoreg, "not used");
1340         if (!VM_Version::supports_avx512vl()) {
1341           vlen_enc = Assembler::AVX_512bit;
1342         }
1343         evpsravq(dst, src, shift, vlen_enc);
1344       } else {
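        // No variable 64-bit arithmetic right shift before AVX-512: do a
        // logical shift of both the value and a sign-bit mask, then
        // (x ^ m) - m sign-extends the bits that were shifted in as zeroes.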
1345         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1346         vpsrlvq(dst, src, shift, vlen_enc);
1347         vpsrlvq(tmp, tmp, shift, vlen_enc);
1348         vpxor(dst, dst, tmp, vlen_enc);
1349         vpsubq(dst, dst, tmp, vlen_enc);
1350       }
1351       break;
1352     }
1353     case Op_LShiftVL: {
1354       assert(tmp == xnoreg, "not used");
1355       vpsllvq(dst, src, shift, vlen_enc);
1356       break;
1357     }
1358     case Op_URShiftVL: {
1359       assert(tmp == xnoreg, "not used");
1360       vpsrlvq(dst, src, shift, vlen_enc);
1361       break;
1362     }
1363     default: assert(false, "%s", NodeClassNames[opcode]);
1364   }
1365 }
1366 
1367 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1368 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
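  // Widen the bytes to dwords, shift at dword width, mask the results back to
  // byte range, and pack the dwords down to words.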
1369   assert(opcode == Op_LShiftVB ||
1370          opcode == Op_RShiftVB ||
1371          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1372   bool sign = (opcode != Op_URShiftVB);
1373   assert(vector_len == 0, "required");
1374   vextendbd(sign, dst, src, 1);
1375   vpmovzxbd(vtmp, shift, 1);
1376   varshiftd(opcode, dst, dst, vtmp, 1);
1377   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1378   vextracti128_high(vtmp, dst);
1379   vpackusdw(dst, dst, vtmp, 0);
1380 }
1381 
1382 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1383 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
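  // Widen the bytes to words (one vector size up), shift at word width, mask
  // back to byte range, and re-pack the words into bytes.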
1384   assert(opcode == Op_LShiftVB ||
1385          opcode == Op_RShiftVB ||
1386          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1387   bool sign = (opcode != Op_URShiftVB);
1388   int ext_vector_len = vector_len + 1;
1389   vextendbw(sign, dst, src, ext_vector_len);
1390   vpmovzxbw(vtmp, shift, ext_vector_len);
1391   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1392   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1393   if (vector_len == 0) {
1394     vextracti128_high(vtmp, dst);
1395     vpackuswb(dst, dst, vtmp, vector_len);
1396   } else {
1397     vextracti64x4_high(vtmp, dst);
1398     vpackuswb(dst, dst, vtmp, vector_len);
1399     vpermq(dst, dst, 0xD8, vector_len);
1400   }
1401 }
1402 
1403 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1404   switch(typ) {
1405     case T_BYTE:
1406       pinsrb(dst, val, idx);
1407       break;
1408     case T_SHORT:
1409       pinsrw(dst, val, idx);
1410       break;
1411     case T_INT:
1412       pinsrd(dst, val, idx);
1413       break;
1414     case T_LONG:
1415       pinsrq(dst, val, idx);
1416       break;
1417     default:
1418       assert(false,"Should not reach here.");
1419       break;
1420   }
1421 }
1422 
1423 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1424   switch(typ) {
1425     case T_BYTE:
1426       vpinsrb(dst, src, val, idx);
1427       break;
1428     case T_SHORT:
1429       vpinsrw(dst, src, val, idx);
1430       break;
1431     case T_INT:
1432       vpinsrd(dst, src, val, idx);
1433       break;
1434     case T_LONG:
1435       vpinsrq(dst, src, val, idx);
1436       break;
1437     default:
1438       assert(false,"Should not reach here.");
1439       break;
1440   }
1441 }
1442 
1443 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1444   switch(typ) {
1445     case T_INT:
1446       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1447       break;
1448     case T_FLOAT:
1449       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1450       break;
1451     case T_LONG:
1452       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1453       break;
1454     case T_DOUBLE:
1455       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1456       break;
1457     default:
1458       assert(false,"Should not reach here.");
1459       break;
1460   }
1461 }
1462 
1463 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1464   switch(typ) {
1465     case T_INT:
1466       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1467       break;
1468     case T_FLOAT:
1469       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1470       break;
1471     case T_LONG:
1472       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1473       break;
1474     case T_DOUBLE:
1475       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1476       break;
1477     default:
1478       assert(false,"Should not reach here.");
1479       break;
1480   }
1481 }
1482 
1483 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1484   switch(typ) {
1485     case T_INT:
1486       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1487       break;
1488     case T_FLOAT:
1489       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1490       break;
1491     case T_LONG:
1492       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1493       break;
1494     case T_DOUBLE:
1495       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1496       break;
1497     default:
1498       assert(false,"Should not reach here.");
1499       break;
1500   }
1501 }
1502 
1503 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
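  // src holds one boolean (0 or 1) per byte; 0 - src turns each into 0 or -1,
  // and the sign-extending moves widen the all-ones mask to the element width.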
1504   if (vlen_in_bytes <= 16) {
1505     pxor (dst, dst);
1506     psubb(dst, src);
1507     switch (elem_bt) {
1508       case T_BYTE:   /* nothing to do */ break;
1509       case T_SHORT:  pmovsxbw(dst, dst); break;
1510       case T_INT:    pmovsxbd(dst, dst); break;
1511       case T_FLOAT:  pmovsxbd(dst, dst); break;
1512       case T_LONG:   pmovsxbq(dst, dst); break;
1513       case T_DOUBLE: pmovsxbq(dst, dst); break;
1514 
1515       default: assert(false, "%s", type2name(elem_bt));
1516     }
1517   } else {
1518     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1519     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1520 
1521     vpxor (dst, dst, dst, vlen_enc);
1522     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1523 
1524     switch (elem_bt) {
1525       case T_BYTE:   /* nothing to do */            break;
1526       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1527       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1528       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1529       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1530       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1531 
1532       default: assert(false, "%s", type2name(elem_bt));
1533     }
1534   }
1535 }
1536 
1537 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
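  // Load the first vlen_in_bytes bytes of the iota (0, 1, 2, ...) index table,
  // using the narrowest load that covers the vector.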
1538   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1539   if (vlen_in_bytes == 4) {
1540     movdl(dst, addr);
1541   } else if (vlen_in_bytes == 8) {
1542     movq(dst, addr);
1543   } else if (vlen_in_bytes == 16) {
1544     movdqu(dst, addr, scratch);
1545   } else if (vlen_in_bytes == 32) {
1546     vmovdqu(dst, addr, scratch);
1547   } else {
1548     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1549     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1550   }
1551 }
1552 
1553 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
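// The integral reducers repeatedly fold the upper half of the vector into the
// lower half and finally combine the result with the scalar in src1; the
// floating-point reducers accumulate strictly one element at a time.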
1554 
1555 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1556   int vector_len = Assembler::AVX_128bit;
1557 
1558   switch (opcode) {
1559     case Op_AndReductionV:  pand(dst, src); break;
1560     case Op_OrReductionV:   por (dst, src); break;
1561     case Op_XorReductionV:  pxor(dst, src); break;
1562     case Op_MinReductionV:
1563       switch (typ) {
1564         case T_BYTE:        pminsb(dst, src); break;
1565         case T_SHORT:       pminsw(dst, src); break;
1566         case T_INT:         pminsd(dst, src); break;
1567         case T_LONG:        assert(UseAVX > 2, "required");
1568                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1569         default:            assert(false, "wrong type");
1570       }
1571       break;
1572     case Op_MaxReductionV:
1573       switch (typ) {
1574         case T_BYTE:        pmaxsb(dst, src); break;
1575         case T_SHORT:       pmaxsw(dst, src); break;
1576         case T_INT:         pmaxsd(dst, src); break;
1577         case T_LONG:        assert(UseAVX > 2, "required");
1578                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1579         default:            assert(false, "wrong type");
1580       }
1581       break;
1582     case Op_AddReductionVF: addss(dst, src); break;
1583     case Op_AddReductionVD: addsd(dst, src); break;
1584     case Op_AddReductionVI:
1585       switch (typ) {
1586         case T_BYTE:        paddb(dst, src); break;
1587         case T_SHORT:       paddw(dst, src); break;
1588         case T_INT:         paddd(dst, src); break;
1589         default:            assert(false, "wrong type");
1590       }
1591       break;
1592     case Op_AddReductionVL: paddq(dst, src); break;
1593     case Op_MulReductionVF: mulss(dst, src); break;
1594     case Op_MulReductionVD: mulsd(dst, src); break;
1595     case Op_MulReductionVI:
1596       switch (typ) {
1597         case T_SHORT:       pmullw(dst, src); break;
1598         case T_INT:         pmulld(dst, src); break;
1599         default:            assert(false, "wrong type");
1600       }
1601       break;
1602     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1603                             vpmullq(dst, dst, src, vector_len); break;
1604     default:                assert(false, "wrong opcode");
1605   }
1606 }
1607 
1608 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1609   int vector_len = Assembler::AVX_256bit;
1610 
1611   switch (opcode) {
1612     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1613     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1614     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1615     case Op_MinReductionV:
1616       switch (typ) {
1617         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1618         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1619         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1620         case T_LONG:        assert(UseAVX > 2, "required");
1621                             vpminsq(dst, src1, src2, vector_len); break;
1622         default:            assert(false, "wrong type");
1623       }
1624       break;
1625     case Op_MaxReductionV:
1626       switch (typ) {
1627         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1628         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1629         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1630         case T_LONG:        assert(UseAVX > 2, "required");
1631                             vpmaxsq(dst, src1, src2, vector_len); break;
1632         default:            assert(false, "wrong type");
1633       }
1634       break;
1635     case Op_AddReductionVI:
1636       switch (typ) {
1637         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1638         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1639         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1640         default:            assert(false, "wrong type");
1641       }
1642       break;
1643     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1644     case Op_MulReductionVI:
1645       switch (typ) {
1646         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1647         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1648         default:            assert(false, "wrong type");
1649       }
1650       break;
1651     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1652     default:                assert(false, "wrong opcode");
1653   }
1654 }
1655 
1656 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1657                                   XMMRegister dst, XMMRegister src,
1658                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1659   switch (opcode) {
1660     case Op_AddReductionVF:
1661     case Op_MulReductionVF:
1662       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1663       break;
1664 
1665     case Op_AddReductionVD:
1666     case Op_MulReductionVD:
1667       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1668       break;
1669 
1670     default: assert(false, "wrong opcode");
1671   }
1672 }
1673 
1674 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1675                              Register dst, Register src1, XMMRegister src2,
1676                              XMMRegister vtmp1, XMMRegister vtmp2) {
1677   switch (vlen) {
1678     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1679     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1680     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1681     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1682 
1683     default: assert(false, "wrong vector length");
1684   }
1685 }
1686 
1687 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1688                              Register dst, Register src1, XMMRegister src2,
1689                              XMMRegister vtmp1, XMMRegister vtmp2) {
1690   switch (vlen) {
1691     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1692     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1693     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1694     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1695 
1696     default: assert(false, "wrong vector length");
1697   }
1698 }
1699 
1700 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1701                              Register dst, Register src1, XMMRegister src2,
1702                              XMMRegister vtmp1, XMMRegister vtmp2) {
1703   switch (vlen) {
1704     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1705     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1706     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1707     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1708 
1709     default: assert(false, "wrong vector length");
1710   }
1711 }
1712 
1713 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1714                              Register dst, Register src1, XMMRegister src2,
1715                              XMMRegister vtmp1, XMMRegister vtmp2) {
1716   switch (vlen) {
1717     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1718     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1719     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1720     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1721 
1722     default: assert(false, "wrong vector length");
1723   }
1724 }
1725 
1726 #ifdef _LP64
1727 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1728                              Register dst, Register src1, XMMRegister src2,
1729                              XMMRegister vtmp1, XMMRegister vtmp2) {
1730   switch (vlen) {
1731     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1732     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1733     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1734 
1735     default: assert(false, "wrong vector length");
1736   }
1737 }
1738 #endif // _LP64
1739 
1740 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1741   switch (vlen) {
1742     case 2:
1743       assert(vtmp2 == xnoreg, "");
1744       reduce2F(opcode, dst, src, vtmp1);
1745       break;
1746     case 4:
1747       assert(vtmp2 == xnoreg, "");
1748       reduce4F(opcode, dst, src, vtmp1);
1749       break;
1750     case 8:
1751       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1752       break;
1753     case 16:
1754       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1755       break;
1756     default: assert(false, "wrong vector length");
1757   }
1758 }
1759 
1760 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1761   switch (vlen) {
1762     case 2:
1763       assert(vtmp2 == xnoreg, "");
1764       reduce2D(opcode, dst, src, vtmp1);
1765       break;
1766     case 4:
1767       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1768       break;
1769     case 8:
1770       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1771       break;
1772     default: assert(false, "wrong vector length");
1773   }
1774 }
1775 
1776 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
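  // Fold element 1 into element 0 (horizontal add for addition, shuffle and
  // combine otherwise), then combine with the scalar in src1 and move the
  // result to dst.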
1777   if (opcode == Op_AddReductionVI) {
1778     if (vtmp1 != src2) {
1779       movdqu(vtmp1, src2);
1780     }
1781     phaddd(vtmp1, vtmp1);
1782   } else {
1783     pshufd(vtmp1, src2, 0x1);
1784     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1785   }
1786   movdl(vtmp2, src1);
1787   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1788   movdl(dst, vtmp1);
1789 }
1790 
1791 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1792   if (opcode == Op_AddReductionVI) {
1793     if (vtmp1 != src2) {
1794       movdqu(vtmp1, src2);
1795     }
1796     phaddd(vtmp1, src2);
1797     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1798   } else {
1799     pshufd(vtmp2, src2, 0xE);
1800     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1801     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1802   }
1803 }
1804 
1805 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1806   if (opcode == Op_AddReductionVI) {
1807     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1808     vextracti128_high(vtmp2, vtmp1);
1809     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1810     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1811   } else {
1812     vextracti128_high(vtmp1, src2);
1813     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1814     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1815   }
1816 }
1817 
1818 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1819   vextracti64x4_high(vtmp2, src2);
1820   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1821   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1822 }
1823 
1824 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1825   pshufd(vtmp2, src2, 0x1);
1826   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1827   movdqu(vtmp1, vtmp2);
1828   psrldq(vtmp1, 2);
1829   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1830   movdqu(vtmp2, vtmp1);
1831   psrldq(vtmp2, 1);
1832   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1833   movdl(vtmp2, src1);
1834   pmovsxbd(vtmp1, vtmp1);
1835   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1836   pextrb(dst, vtmp1, 0x0);
1837   movsbl(dst, dst);
1838 }
1839 
1840 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1841   pshufd(vtmp1, src2, 0xE);
1842   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1843   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1844 }
1845 
1846 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1847   vextracti128_high(vtmp2, src2);
1848   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1849   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1850 }
1851 
1852 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1853   vextracti64x4_high(vtmp1, src2);
1854   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1855   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1856 }
1857 
1858 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1859   pmovsxbw(vtmp2, src2);
1860   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1861 }
1862 
1863 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1864   if (UseAVX > 1) {
1865     int vector_len = Assembler::AVX_256bit;
1866     vpmovsxbw(vtmp1, src2, vector_len);
1867     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1868   } else {
1869     pmovsxbw(vtmp2, src2);
1870     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes down to the low half
    pmovsxbw(vtmp2, vtmp2);
1873     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1874   }
1875 }
1876 
1877 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1878   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1879     int vector_len = Assembler::AVX_512bit;
1880     vpmovsxbw(vtmp1, src2, vector_len);
1881     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1882   } else {
1883     assert(UseAVX >= 2,"Should not reach here.");
1884     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1885     vextracti128_high(vtmp2, src2);
1886     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1887   }
1888 }
1889 
1890 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1891   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1892   vextracti64x4_high(vtmp2, src2);
1893   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1894 }
1895 
1896 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1897   if (opcode == Op_AddReductionVI) {
1898     if (vtmp1 != src2) {
1899       movdqu(vtmp1, src2);
1900     }
1901     phaddw(vtmp1, vtmp1);
1902     phaddw(vtmp1, vtmp1);
1903   } else {
1904     pshufd(vtmp2, src2, 0x1);
1905     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1906     movdqu(vtmp1, vtmp2);
1907     psrldq(vtmp1, 2);
1908     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1909   }
1910   movdl(vtmp2, src1);
1911   pmovsxwd(vtmp1, vtmp1);
1912   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1913   pextrw(dst, vtmp1, 0x0);
1914   movswl(dst, dst);
1915 }
1916 
1917 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1918   if (opcode == Op_AddReductionVI) {
1919     if (vtmp1 != src2) {
1920       movdqu(vtmp1, src2);
1921     }
1922     phaddw(vtmp1, src2);
1923   } else {
1924     pshufd(vtmp1, src2, 0xE);
1925     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1926   }
1927   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1928 }
1929 
1930 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1931   if (opcode == Op_AddReductionVI) {
1932     int vector_len = Assembler::AVX_256bit;
1933     vphaddw(vtmp2, src2, src2, vector_len);
1934     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1935   } else {
1936     vextracti128_high(vtmp2, src2);
1937     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1938   }
1939   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1940 }
1941 
1942 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1943   int vector_len = Assembler::AVX_256bit;
1944   vextracti64x4_high(vtmp1, src2);
1945   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1946   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1947 }
1948 
1949 #ifdef _LP64
1950 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1951   pshufd(vtmp2, src2, 0xE);
1952   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1953   movdq(vtmp1, src1);
1954   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1955   movdq(dst, vtmp1);
1956 }
1957 
1958 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1959   vextracti128_high(vtmp1, src2);
1960   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1961   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1962 }
1963 
1964 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1965   vextracti64x4_high(vtmp2, src2);
1966   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1967   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1968 }
1969 
1970 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
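  // Build a k-mask with the low 'len' bits set: BZHI zeroes the bits of an
  // all-ones value at position len and above.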
1971   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1972   mov64(temp, -1L);
1973   bzhiq(temp, temp, len);
1974   kmovql(dst, temp);
1975 }
1976 #endif // _LP64
1977 
1978 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1979   reduce_operation_128(T_FLOAT, opcode, dst, src);
1980   pshufd(vtmp, src, 0x1);
1981   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1982 }
1983 
1984 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1985   reduce2F(opcode, dst, src, vtmp);
1986   pshufd(vtmp, src, 0x2);
1987   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1988   pshufd(vtmp, src, 0x3);
1989   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1990 }
1991 
1992 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1993   reduce4F(opcode, dst, src, vtmp2);
1994   vextractf128_high(vtmp2, src);
1995   reduce4F(opcode, dst, vtmp2, vtmp1);
1996 }
1997 
1998 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1999   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2000   vextracti64x4_high(vtmp1, src);
2001   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2002 }
2003 
2004 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2005   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2006   pshufd(vtmp, src, 0xE);
2007   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2008 }
2009 
2010 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2011   reduce2D(opcode, dst, src, vtmp2);
2012   vextractf128_high(vtmp2, src);
2013   reduce2D(opcode, dst, vtmp2, vtmp1);
2014 }
2015 
2016 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2017   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2018   vextracti64x4_high(vtmp1, src);
2019   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2020 }
2021 
2022 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
2023   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2024 }
2025 
2026 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
2027   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2028 }
2029 
2030 
2031 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2032                                           XMMRegister dst, XMMRegister src,
2033                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2034                                           XMMRegister xmm_0, XMMRegister xmm_1) {
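  // Halve the vector log2(vlen) times: extract the upper 256/128 bits (or
  // permute within a 128-bit lane) and combine the halves with vminmax_fp.
  // If dst already holds a valid value, it is folded in at the end.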
2035   int permconst[] = {1, 14};
2036   XMMRegister wsrc = src;
2037   XMMRegister wdst = xmm_0;
2038   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2039 
2040   int vlen_enc = Assembler::AVX_128bit;
2041   if (vlen == 16) {
2042     vlen_enc = Assembler::AVX_256bit;
2043   }
2044 
2045   for (int i = log2(vlen) - 1; i >=0; i--) {
2046     if (i == 0 && !is_dst_valid) {
2047       wdst = dst;
2048     }
2049     if (i == 3) {
2050       vextracti64x4_high(wtmp, wsrc);
2051     } else if (i == 2) {
2052       vextracti128_high(wtmp, wsrc);
2053     } else { // i = [0,1]
2054       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2055     }
2056     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2057     wsrc = wdst;
2058     vlen_enc = Assembler::AVX_128bit;
2059   }
2060   if (is_dst_valid) {
2061     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2062   }
2063 }
2064 
2065 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2066                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2067                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2068   XMMRegister wsrc = src;
2069   XMMRegister wdst = xmm_0;
2070   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2071   int vlen_enc = Assembler::AVX_128bit;
2072   if (vlen == 8) {
2073     vlen_enc = Assembler::AVX_256bit;
2074   }
2075   for (int i = log2(vlen) - 1; i >=0; i--) {
2076     if (i == 0 && !is_dst_valid) {
2077       wdst = dst;
2078     }
2079     if (i == 1) {
2080       vextracti128_high(wtmp, wsrc);
2081     } else if (i == 2) {
2082       vextracti64x4_high(wtmp, wsrc);
2083     } else {
2084       assert(i == 0, "%d", i);
2085       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2086     }
2087     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2088     wsrc = wdst;
2089     vlen_enc = Assembler::AVX_128bit;
2090   }
2091   if (is_dst_valid) {
2092     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2093   }
2094 }
2095 
2096 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2097   switch (bt) {
2098     case T_BYTE:  pextrb(dst, src, idx); break;
2099     case T_SHORT: pextrw(dst, src, idx); break;
2100     case T_INT:   pextrd(dst, src, idx); break;
2101     case T_LONG:  pextrq(dst, src, idx); break;
2102 
2103     default:
2104       assert(false,"Should not reach here.");
2105       break;
2106   }
2107 }
2108 
2109 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
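  // Return the register holding the 128-bit lane that contains elemindex:
  // lane 0 is src itself, higher lanes are extracted into dst first.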
2110   int esize =  type2aelembytes(typ);
2111   int elem_per_lane = 16/esize;
2112   int lane = elemindex / elem_per_lane;
2113   int eindex = elemindex % elem_per_lane;
2114 
2115   if (lane >= 2) {
2116     assert(UseAVX > 2, "required");
2117     vextractf32x4(dst, src, lane & 3);
2118     return dst;
2119   } else if (lane > 0) {
2120     assert(UseAVX > 0, "required");
2121     vextractf128(dst, src, lane);
2122     return dst;
2123   } else {
2124     return src;
2125   }
2126 }
2127 
2128 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2129   if (typ == T_BYTE) {
2130     movsbl(dst, dst);
2131   } else if (typ == T_SHORT) {
2132     movswl(dst, dst);
2133   }
2134 }
2135 
2136 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2137   int esize =  type2aelembytes(typ);
2138   int elem_per_lane = 16/esize;
2139   int eindex = elemindex % elem_per_lane;
2140   assert(is_integral_type(typ),"required");
2141 
2142   if (eindex == 0) {
2143     if (typ == T_LONG) {
2144       movq(dst, src);
2145     } else {
2146       movdl(dst, src);
2147       movsxl(typ, dst);
2148     }
2149   } else {
2150     extract(typ, dst, src, eindex);
2151     movsxl(typ, dst);
2152   }
2153 }
2154 
2155 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2156   int esize =  type2aelembytes(typ);
2157   int elem_per_lane = 16/esize;
2158   int eindex = elemindex % elem_per_lane;
2159   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2160 
2161   if (eindex == 0) {
2162     movq(dst, src);
2163   } else {
2164     if (typ == T_FLOAT) {
2165       if (UseAVX == 0) {
2166         movdqu(dst, src);
2167         pshufps(dst, dst, eindex);
2168       } else {
2169         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2170       }
2171     } else {
2172       if (UseAVX == 0) {
2173         movdqu(dst, src);
2174         psrldq(dst, eindex*esize);
2175       } else {
2176         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2177       }
2178       movq(dst, dst);
2179     }
2180   }
2181   // Zero upper bits
2182   if (typ == T_FLOAT) {
2183     if (UseAVX == 0) {
2184       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2185       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2186       pand(dst, vtmp);
2187     } else {
2188       assert((tmp != noreg), "required.");
2189       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2190     }
2191   }
2192 }
2193 
2194 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2195   switch(typ) {
2196     case T_BYTE:
2197     case T_BOOLEAN:
2198       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2199       break;
2200     case T_SHORT:
2201     case T_CHAR:
2202       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2203       break;
2204     case T_INT:
2205     case T_FLOAT:
2206       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2207       break;
2208     case T_LONG:
2209     case T_DOUBLE:
2210       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2211       break;
2212     default:
2213       assert(false,"Should not reach here.");
2214       break;
2215   }
2216 }
2217 
2218 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2219   switch(typ) {
2220     case T_BOOLEAN:
2221     case T_BYTE:
2222       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2223       break;
2224     case T_CHAR:
2225     case T_SHORT:
2226       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2227       break;
2228     case T_INT:
2229     case T_FLOAT:
2230       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2231       break;
2232     case T_LONG:
2233     case T_DOUBLE:
2234       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2235       break;
2236     default:
2237       assert(false,"Should not reach here.");
2238       break;
2239   }
2240 }
2241 
2242 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2243                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
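  // Unsigned element compare: zero-extend both operands to the next wider
  // element type (where the values fit as signed), compare at that width,
  // then pack the results back down to the original element size.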
2244   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2245   switch (typ) {
2246   case T_BYTE:
2247     vpmovzxbw(vtmp1, src1, vlen_enc);
2248     vpmovzxbw(vtmp2, src2, vlen_enc);
2249     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2250     vpacksswb(dst, dst, dst, vlen_enc);
2251     break;
2252   case T_SHORT:
2253     vpmovzxwd(vtmp1, src1, vlen_enc);
2254     vpmovzxwd(vtmp2, src2, vlen_enc);
2255     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2256     vpackssdw(dst, dst, dst, vlen_enc);
2257     break;
2258   case T_INT:
2259     vpmovzxdq(vtmp1, src1, vlen_enc);
2260     vpmovzxdq(vtmp2, src2, vlen_enc);
2261     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2262     vpermilps(dst, dst, 8, vlen_enc);
2263     break;
2264   default:
2265     assert(false, "Should not reach here");
2266   }
2267   if (vlen_in_bytes == 16) {
2268     vpermpd(dst, dst, 0x8, vlen_enc);
2269   }
2270 }
2271 
2272 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2273                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
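  // Same widening trick as vpcmpu, applied separately to the lower and upper
  // 128-bit halves of the operands, with the results packed back together.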
2274   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2275   switch (typ) {
2276   case T_BYTE:
2277     vpmovzxbw(vtmp1, src1, vlen_enc);
2278     vpmovzxbw(vtmp2, src2, vlen_enc);
2279     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2280     vextracti128(vtmp1, src1, 1);
2281     vextracti128(vtmp2, src2, 1);
2282     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2283     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2284     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2285     vpacksswb(dst, dst, vtmp3, vlen_enc);
2286     vpermpd(dst, dst, 0xd8, vlen_enc);
2287     break;
2288   case T_SHORT:
2289     vpmovzxwd(vtmp1, src1, vlen_enc);
2290     vpmovzxwd(vtmp2, src2, vlen_enc);
2291     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2292     vextracti128(vtmp1, src1, 1);
2293     vextracti128(vtmp2, src2, 1);
2294     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2295     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2296     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2297     vpackssdw(dst, dst, vtmp3, vlen_enc);
2298     vpermpd(dst, dst, 0xd8, vlen_enc);
2299     break;
2300   case T_INT:
2301     vpmovzxdq(vtmp1, src1, vlen_enc);
2302     vpmovzxdq(vtmp2, src2, vlen_enc);
2303     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2304     vpshufd(dst, dst, 8, vlen_enc);
2305     vpermq(dst, dst, 8, vlen_enc);
2306     vextracti128(vtmp1, src1, 1);
2307     vextracti128(vtmp2, src2, 1);
2308     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2309     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2310     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2311     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2312     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2313     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2314     break;
2315   default:
2316     assert(false, "Should not reach here");
2317   }
2318 }
2319 
2320 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2321   switch(typ) {
2322     case T_BYTE:
2323       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2324       break;
2325     case T_SHORT:
2326       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2327       break;
2328     case T_INT:
2329     case T_FLOAT:
2330       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2331       break;
2332     case T_LONG:
2333     case T_DOUBLE:
2334       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2335       break;
2336     default:
2337       assert(false,"Should not reach here.");
2338       break;
2339   }
2340 }
2341 
2342 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2343                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
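  // Set condition flags for a vector test: (v)ptest for vectors up to 32 bytes
  // (sub-16-byte inputs are first broadcast to fill 128 bits), and a masked
  // compare plus ktest/kortest for 64-byte vectors. The caller branches on the
  // resulting flags.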
2344   switch(vlen) {
2345     case 4:
2346       assert(vtmp1 != xnoreg, "required.");
2347       // Broadcast lower 32 bits to 128 bits before ptest
2348       pshufd(vtmp1, src1, 0x0);
2349       if (bt == BoolTest::overflow) {
2350         assert(vtmp2 != xnoreg, "required.");
2351         pshufd(vtmp2, src2, 0x0);
2352       } else {
2353         assert(vtmp2 == xnoreg, "required.");
2354         vtmp2 = src2;
2355       }
2356       ptest(vtmp1, vtmp2);
      break;
2358     case 8:
2359       assert(vtmp1 != xnoreg, "required.");
2360       // Broadcast lower 64 bits to 128 bits before ptest
2361       pshufd(vtmp1, src1, 0x4);
2362       if (bt == BoolTest::overflow) {
2363         assert(vtmp2 != xnoreg, "required.");
2364         pshufd(vtmp2, src2, 0x4);
2365       } else {
2366         assert(vtmp2 == xnoreg, "required.");
2367         vtmp2 = src2;
2368       }
2369       ptest(vtmp1, vtmp2);
      break;
2371     case 16:
2372       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2373       ptest(src1, src2);
2374       break;
2375     case 32:
2376       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2377       vptest(src1, src2, Assembler::AVX_256bit);
2378       break;
2379     case 64:
2380       {
2381         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2382         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2383         if (bt == BoolTest::ne) {
2384           ktestql(mask, mask);
2385         } else {
2386           assert(bt == BoolTest::overflow, "required");
2387           kortestql(mask, mask);
2388         }
2389       }
2390       break;
2391     default:
2392       assert(false,"Should not reach here.");
2393       break;
2394   }
2395 }
2396 
2397 //-------------------------------------------------------------------------------------------
2398 
2399 // IndexOf for constant substrings with size >= 8 chars
2400 // which don't need to be loaded through stack.
2401 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2402                                          Register cnt1, Register cnt2,
2403                                          int int_cnt2,  Register result,
2404                                          XMMRegister vec, Register tmp,
2405                                          int ae) {
2406   ShortBranchVerifier sbv(this);
2407   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2408   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2409 
2410   // This method uses the pcmpestri instruction with bound registers
2411   //   inputs:
2412   //     xmm - substring
2413   //     rax - substring length (elements count)
2414   //     mem - scanned string
2415   //     rdx - string length (elements count)
2416   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2417   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2418   //   outputs:
2419   //     rcx - matched index in string
2420   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2421   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2422   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2423   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2424   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2425 
2426   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2427         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2428         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2429 
2430   // Note, inline_string_indexOf() generates checks:
2431   // if (substr.count > string.count) return -1;
2432   // if (substr.count == 0) return 0;
2433   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2434 
2435   // Load substring.
2436   if (ae == StrIntrinsicNode::UL) {
2437     pmovzxbw(vec, Address(str2, 0));
2438   } else {
2439     movdqu(vec, Address(str2, 0));
2440   }
2441   movl(cnt2, int_cnt2);
2442   movptr(result, str1); // string addr
2443 
2444   if (int_cnt2 > stride) {
2445     jmpb(SCAN_TO_SUBSTR);
2446 
2447     // Reload substr for rescan, this code
2448     // is executed only for large substrings (> 8 chars)
2449     bind(RELOAD_SUBSTR);
2450     if (ae == StrIntrinsicNode::UL) {
2451       pmovzxbw(vec, Address(str2, 0));
2452     } else {
2453       movdqu(vec, Address(str2, 0));
2454     }
2455     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2456 
2457     bind(RELOAD_STR);
2458     // We came here after the beginning of the substring was
2459     // matched but the rest of it was not so we need to search
2460     // again. Start from the next element after the previous match.
2461 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2464     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2465     subl(cnt1, cnt2);
2466     addl(cnt1, int_cnt2);
2467     movl(cnt2, int_cnt2); // Now restore cnt2
2468 
2469     decrementl(cnt1);     // Shift to next element
2470     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2472 
2473     addptr(result, (1<<scale1));
2474 
2475   } // (int_cnt2 > 8)
2476 
2477   // Scan string for start of substr in 16-byte vectors
2478   bind(SCAN_TO_SUBSTR);
2479   pcmpestri(vec, Address(result, 0), mode);
2480   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2481   subl(cnt1, stride);
2482   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2483   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2485   addptr(result, 16);
2486   jmpb(SCAN_TO_SUBSTR);
2487 
2488   // Found a potential substr
2489   bind(FOUND_CANDIDATE);
2490   // Matched whole vector if first element matched (tmp(rcx) == 0).
2491   if (int_cnt2 == stride) {
2492     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2493   } else { // int_cnt2 > 8
2494     jccb(Assembler::overflow, FOUND_SUBSTR);
2495   }
2496   // After pcmpestri tmp(rcx) contains matched element index
2497   // Compute start addr of substr
2498   lea(result, Address(result, tmp, scale1));
2499 
2500   // Make sure string is still long enough
2501   subl(cnt1, tmp);
2502   cmpl(cnt1, cnt2);
2503   if (int_cnt2 == stride) {
2504     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2505   } else { // int_cnt2 > 8
2506     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2507   }
  // Left less than substring.
2509 
2510   bind(RET_NOT_FOUND);
2511   movl(result, -1);
2512   jmp(EXIT);
2513 
2514   if (int_cnt2 > stride) {
2515     // This code is optimized for the case when whole substring
2516     // is matched if its head is matched.
2517     bind(MATCH_SUBSTR_HEAD);
2518     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2520     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2521 
2522     Label CONT_SCAN_SUBSTR;
2523     // Compare the rest of substring (> 8 chars).
2524     bind(FOUND_SUBSTR);
2525     // First 8 chars are already matched.
2526     negptr(cnt2);
2527     addptr(cnt2, stride);
2528 
2529     bind(SCAN_SUBSTR);
2530     subl(cnt1, stride);
2531     cmpl(cnt2, -stride); // Do not read beyond substring
2532     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2533     // Back-up strings to avoid reading beyond substring:
2534     // cnt1 = cnt1 - cnt2 + 8
2535     addl(cnt1, cnt2); // cnt2 is negative
2536     addl(cnt1, stride);
2537     movl(cnt2, stride); negptr(cnt2);
2538     bind(CONT_SCAN_SUBSTR);
2539     if (int_cnt2 < (int)G) {
2540       int tail_off1 = int_cnt2<<scale1;
2541       int tail_off2 = int_cnt2<<scale2;
2542       if (ae == StrIntrinsicNode::UL) {
2543         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2544       } else {
2545         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2546       }
2547       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2548     } else {
2549       // calculate index in register to avoid integer overflow (int_cnt2*2)
2550       movl(tmp, int_cnt2);
2551       addptr(tmp, cnt2);
2552       if (ae == StrIntrinsicNode::UL) {
2553         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2554       } else {
2555         movdqu(vec, Address(str2, tmp, scale2, 0));
2556       }
2557       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2558     }
2559     // Need to reload strings pointers if not matched whole vector
2560     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2561     addptr(cnt2, stride);
2562     jcc(Assembler::negative, SCAN_SUBSTR);
2563     // Fall through if found full substring
2564 
2565   } // (int_cnt2 > 8)
2566 
2567   bind(RET_FOUND);
2568   // Found result if we matched full small substring.
2569   // Compute substr offset
2570   subptr(result, str1);
2571   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2572     shrl(result, 1); // index
2573   }
2574   bind(EXIT);
2575 
2576 } // string_indexofC8
2577 
2578 // Small strings are loaded through stack if they cross page boundary.
2579 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2580                                        Register cnt1, Register cnt2,
2581                                        int int_cnt2,  Register result,
2582                                        XMMRegister vec, Register tmp,
2583                                        int ae) {
2584   ShortBranchVerifier sbv(this);
2585   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2586   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2587 
2588   //
2589   // int_cnt2 is length of small (< 8 chars) constant substring
2590   // or (-1) for non constant substring in which case its length
2591   // is in cnt2 register.
2592   //
2593   // Note, inline_string_indexOf() generates checks:
2594   // if (substr.count > string.count) return -1;
2595   // if (substr.count == 0) return 0;
2596   //
2597   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2598   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2599   // This method uses the pcmpestri instruction with bound registers
2600   //   inputs:
2601   //     xmm - substring
2602   //     rax - substring length (elements count)
2603   //     mem - scanned string
2604   //     rdx - string length (elements count)
2605   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2606   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2607   //   outputs:
2608   //     rcx - matched index in string
2609   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2610   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2611   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2612   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2613 
2614   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2615         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2616         FOUND_CANDIDATE;
2617 
2618   { //========================================================
2619     // We don't know where these strings are located
2620     // and we can't read beyond them. Load them through stack.
2621     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2622 
2623     movptr(tmp, rsp); // save old SP
2624 
2625     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2626       if (int_cnt2 == (1>>scale2)) { // One byte
2627         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2628         load_unsigned_byte(result, Address(str2, 0));
2629         movdl(vec, result); // move 32 bits
2630       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2631         // Not enough header space in 32-bit VM: 12+3 = 15.
2632         movl(result, Address(str2, -1));
2633         shrl(result, 8);
2634         movdl(vec, result); // move 32 bits
2635       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2636         load_unsigned_short(result, Address(str2, 0));
2637         movdl(vec, result); // move 32 bits
2638       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2639         movdl(vec, Address(str2, 0)); // move 32 bits
2640       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2641         movq(vec, Address(str2, 0));  // move 64 bits
2642       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2643         // Array header size is 12 bytes in 32-bit VM
2644         // + 6 bytes for 3 chars == 18 bytes,
2645         // enough space to load vec and shift.
2646         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2647         if (ae == StrIntrinsicNode::UL) {
2648           int tail_off = int_cnt2-8;
2649           pmovzxbw(vec, Address(str2, tail_off));
2650           psrldq(vec, -2*tail_off);
2651         }
2652         else {
2653           int tail_off = int_cnt2*(1<<scale2);
2654           movdqu(vec, Address(str2, tail_off-16));
2655           psrldq(vec, 16-tail_off);
2656         }
2657       }
2658     } else { // not constant substring
2659       cmpl(cnt2, stride);
2660       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2661 
2662       // We can read beyond the string if str2+16 does not cross a page boundary,
2663       // since heaps are aligned and mapped by pages.
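           // E.g. with a 4096-byte page: if (str2 & 0xFFF) <= 0xFF0, the 16-byte
           // load [str2, str2+16) stays within the current page and is safe.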
2664       assert(os::vm_page_size() < (int)G, "default page should be small");
2665       movl(result, str2); // We need only low 32 bits
2666       andl(result, (os::vm_page_size()-1));
2667       cmpl(result, (os::vm_page_size()-16));
2668       jccb(Assembler::belowEqual, CHECK_STR);
2669 
2670       // Move small strings to the stack so that 16 bytes can be loaded into vec.
2671       subptr(rsp, 16);
2672       int stk_offset = wordSize-(1<<scale2);
2673       push(cnt2);
2674 
2675       bind(COPY_SUBSTR);
2676       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2677         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2678         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2679       } else if (ae == StrIntrinsicNode::UU) {
2680         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2681         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2682       }
2683       decrement(cnt2);
2684       jccb(Assembler::notZero, COPY_SUBSTR);
2685 
2686       pop(cnt2);
2687       movptr(str2, rsp);  // New substring address
2688     } // non constant
2689 
2690     bind(CHECK_STR);
2691     cmpl(cnt1, stride);
2692     jccb(Assembler::aboveEqual, BIG_STRINGS);
2693 
2694     // Check whether the read would cross a page boundary.
2695     movl(result, str1); // We need only low 32 bits
2696     andl(result, (os::vm_page_size()-1));
2697     cmpl(result, (os::vm_page_size()-16));
2698     jccb(Assembler::belowEqual, BIG_STRINGS);
2699 
2700     subptr(rsp, 16);
2701     int stk_offset = -(1<<scale1);
2702     if (int_cnt2 < 0) { // not constant
2703       push(cnt2);
2704       stk_offset += wordSize;
2705     }
2706     movl(cnt2, cnt1);
2707 
2708     bind(COPY_STR);
2709     if (ae == StrIntrinsicNode::LL) {
2710       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2711       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2712     } else {
2713       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2714       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2715     }
2716     decrement(cnt2);
2717     jccb(Assembler::notZero, COPY_STR);
2718 
2719     if (int_cnt2 < 0) { // not constant
2720       pop(cnt2);
2721     }
2722     movptr(str1, rsp);  // New string address
2723 
2724     bind(BIG_STRINGS);
2725     // Load substring.
2726     if (int_cnt2 < 0) { // -1
2727       if (ae == StrIntrinsicNode::UL) {
2728         pmovzxbw(vec, Address(str2, 0));
2729       } else {
2730         movdqu(vec, Address(str2, 0));
2731       }
2732       push(cnt2);       // substr count
2733       push(str2);       // substr addr
2734       push(str1);       // string addr
2735     } else {
2736       // Small (< 8 chars) constant substrings are loaded already.
2737       movl(cnt2, int_cnt2);
2738     }
2739     push(tmp);  // original SP
2740 
2741   } // Finished loading
2742 
2743   //========================================================
2744   // Start search
2745   //
2746 
2747   movptr(result, str1); // string addr
2748 
2749   if (int_cnt2  < 0) {  // Only for non constant substring
2750     jmpb(SCAN_TO_SUBSTR);
2751 
2752     // SP saved at sp+0
2753     // String saved at sp+1*wordSize
2754     // Substr saved at sp+2*wordSize
2755     // Substr count saved at sp+3*wordSize
2756 
2757     // Reload substr for rescan, this code
2758     // is executed only for large substrings (> 8 chars)
2759     bind(RELOAD_SUBSTR);
2760     movptr(str2, Address(rsp, 2*wordSize));
2761     movl(cnt2, Address(rsp, 3*wordSize));
2762     if (ae == StrIntrinsicNode::UL) {
2763       pmovzxbw(vec, Address(str2, 0));
2764     } else {
2765       movdqu(vec, Address(str2, 0));
2766     }
2767     // We came here after the beginning of the substring was
2768     // matched but the rest of it was not so we need to search
2769     // again. Start from the next element after the previous match.
2770     subptr(str1, result); // Restore counter
2771     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2772       shrl(str1, 1);
2773     }
2774     addl(cnt1, str1);
2775     decrementl(cnt1);   // Shift to next element
2776     cmpl(cnt1, cnt2);
2777     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2778 
2779     addptr(result, (1<<scale1));
2780   } // non constant
2781 
2782   // Scan string for start of substr in 16-byte vectors
2783   bind(SCAN_TO_SUBSTR);
2784   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2785   pcmpestri(vec, Address(result, 0), mode);
2786   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2787   subl(cnt1, stride);
2788   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2789   cmpl(cnt1, cnt2);
2790   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2791   addptr(result, 16);
2792 
2793   bind(ADJUST_STR);
2794   cmpl(cnt1, stride); // Do not read beyond string
2795   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2796   // Back-up string to avoid reading beyond string.
2797   lea(result, Address(result, cnt1, scale1, -16));
2798   movl(cnt1, stride);
2799   jmpb(SCAN_TO_SUBSTR);
2800 
2801   // Found a potential substr
2802   bind(FOUND_CANDIDATE);
2803   // After pcmpestri tmp(rcx) contains matched element index
2804 
2805   // Make sure string is still long enough
2806   subl(cnt1, tmp);
2807   cmpl(cnt1, cnt2);
2808   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2809   // Left less than substring.
2810 
2811   bind(RET_NOT_FOUND);
2812   movl(result, -1);
2813   jmp(CLEANUP);
2814 
2815   bind(FOUND_SUBSTR);
2816   // Compute start addr of substr
2817   lea(result, Address(result, tmp, scale1));
2818   if (int_cnt2 > 0) { // Constant substring
2819     // Repeat search for small substring (< 8 chars)
2820     // from new point without reloading substring.
2821     // Have to check that we don't read beyond string.
2822     cmpl(tmp, stride-int_cnt2);
2823     jccb(Assembler::greater, ADJUST_STR);
2824     // Fall through if matched whole substring.
2825   } else { // non constant
2826     assert(int_cnt2 == -1, "should be != 0");
2827 
2828     addl(tmp, cnt2);
2829     // Found result if we matched whole substring.
2830     cmpl(tmp, stride);
2831     jcc(Assembler::lessEqual, RET_FOUND);
2832 
2833     // Repeat search for small substring (<= 8 chars)
2834     // from new point 'str1' without reloading substring.
2835     cmpl(cnt2, stride);
2836     // Have to check that we don't read beyond string.
2837     jccb(Assembler::lessEqual, ADJUST_STR);
2838 
2839     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2840     // Compare the rest of substring (> 8 chars).
2841     movptr(str1, result);
2842 
2843     cmpl(tmp, cnt2);
2844     // First 8 chars are already matched.
2845     jccb(Assembler::equal, CHECK_NEXT);
2846 
2847     bind(SCAN_SUBSTR);
2848     pcmpestri(vec, Address(str1, 0), mode);
2849     // Need to reload the string pointers if the whole vector did not match
2850     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2851 
2852     bind(CHECK_NEXT);
2853     subl(cnt2, stride);
2854     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2855     addptr(str1, 16);
2856     if (ae == StrIntrinsicNode::UL) {
2857       addptr(str2, 8);
2858     } else {
2859       addptr(str2, 16);
2860     }
2861     subl(cnt1, stride);
2862     cmpl(cnt2, stride); // Do not read beyond substring
2863     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2864     // Back-up strings to avoid reading beyond substring.
2865 
2866     if (ae == StrIntrinsicNode::UL) {
2867       lea(str2, Address(str2, cnt2, scale2, -8));
2868       lea(str1, Address(str1, cnt2, scale1, -16));
2869     } else {
2870       lea(str2, Address(str2, cnt2, scale2, -16));
2871       lea(str1, Address(str1, cnt2, scale1, -16));
2872     }
2873     subl(cnt1, cnt2);
2874     movl(cnt2, stride);
2875     addl(cnt1, stride);
2876     bind(CONT_SCAN_SUBSTR);
2877     if (ae == StrIntrinsicNode::UL) {
2878       pmovzxbw(vec, Address(str2, 0));
2879     } else {
2880       movdqu(vec, Address(str2, 0));
2881     }
2882     jmp(SCAN_SUBSTR);
2883 
2884     bind(RET_FOUND_LONG);
2885     movptr(str1, Address(rsp, wordSize));
2886   } // non constant
2887 
2888   bind(RET_FOUND);
2889   // Compute substr offset
2890   subptr(result, str1);
2891   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2892     shrl(result, 1); // index
2893   }
2894   bind(CLEANUP);
2895   pop(rsp); // restore SP
2896 
2897 } // string_indexof
2898 
2899 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2900                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2901   ShortBranchVerifier sbv(this);
2902   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2903 
2904   int stride = 8;
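       // Java-level sketch of what is computed (illustrative): the index of the
       // first occurrence of the char value 'ch' in the first cnt1 chars of str1,
       // or -1 if it does not occur.
       //   static int indexOfChar(char[] s, int len, char ch) {
       //     for (int i = 0; i < len; i++) {
       //       if (s[i] == ch) return i;
       //     }
       //     return -1;
       //   }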
2905 
2906   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2907         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2908         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2909         FOUND_SEQ_CHAR, DONE_LABEL;
2910 
2911   movptr(result, str1);
2912   if (UseAVX >= 2) {
2913     cmpl(cnt1, stride);
2914     jcc(Assembler::less, SCAN_TO_CHAR);
2915     cmpl(cnt1, 2*stride);
2916     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2917     movdl(vec1, ch);
2918     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2919     vpxor(vec2, vec2);
2920     movl(tmp, cnt1);
2921     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2922     andl(cnt1,0x0000000F);  //tail count (in chars)
2923 
2924     bind(SCAN_TO_16_CHAR_LOOP);
2925     vmovdqu(vec3, Address(result, 0));
2926     vpcmpeqw(vec3, vec3, vec1, 1);
2927     vptest(vec2, vec3);
2928     jcc(Assembler::carryClear, FOUND_CHAR);
2929     addptr(result, 32);
2930     subl(tmp, 2*stride);
2931     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2932     jmp(SCAN_TO_8_CHAR);
2933     bind(SCAN_TO_8_CHAR_INIT);
2934     movdl(vec1, ch);
2935     pshuflw(vec1, vec1, 0x00);
2936     pshufd(vec1, vec1, 0);
2937     pxor(vec2, vec2);
2938   }
2939   bind(SCAN_TO_8_CHAR);
2940   cmpl(cnt1, stride);
2941   jcc(Assembler::less, SCAN_TO_CHAR);
2942   if (UseAVX < 2) {
2943     movdl(vec1, ch);
2944     pshuflw(vec1, vec1, 0x00);
2945     pshufd(vec1, vec1, 0);
2946     pxor(vec2, vec2);
2947   }
2948   movl(tmp, cnt1);
2949   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2950   andl(cnt1,0x00000007);  //tail count (in chars)
2951 
2952   bind(SCAN_TO_8_CHAR_LOOP);
2953   movdqu(vec3, Address(result, 0));
2954   pcmpeqw(vec3, vec1);
2955   ptest(vec2, vec3);
2956   jcc(Assembler::carryClear, FOUND_CHAR);
2957   addptr(result, 16);
2958   subl(tmp, stride);
2959   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2960   bind(SCAN_TO_CHAR);
2961   testl(cnt1, cnt1);
2962   jcc(Assembler::zero, RET_NOT_FOUND);
2963   bind(SCAN_TO_CHAR_LOOP);
2964   load_unsigned_short(tmp, Address(result, 0));
2965   cmpl(ch, tmp);
2966   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2967   addptr(result, 2);
2968   subl(cnt1, 1);
2969   jccb(Assembler::zero, RET_NOT_FOUND);
2970   jmp(SCAN_TO_CHAR_LOOP);
2971 
2972   bind(RET_NOT_FOUND);
2973   movl(result, -1);
2974   jmpb(DONE_LABEL);
2975 
2976   bind(FOUND_CHAR);
2977   if (UseAVX >= 2) {
2978     vpmovmskb(tmp, vec3);
2979   } else {
2980     pmovmskb(tmp, vec3);
2981   }
2982   bsfl(ch, tmp);
2983   addptr(result, ch);
2984 
2985   bind(FOUND_SEQ_CHAR);
2986   subptr(result, str1);
2987   shrl(result, 1);
2988 
2989   bind(DONE_LABEL);
2990 } // string_indexof_char
2991 
2992 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2993                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2994   ShortBranchVerifier sbv(this);
2995   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2996 
2997   int stride = 16;
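       // Latin-1 (byte[]) variant of the sketch above: returns the index of the
       // first byte equal to 'ch' within the first cnt1 bytes of str1, or -1.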
2998 
2999   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3000         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3001         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3002         FOUND_SEQ_CHAR, DONE_LABEL;
3003 
3004   movptr(result, str1);
3005   if (UseAVX >= 2) {
3006     cmpl(cnt1, stride);
3007     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3008     cmpl(cnt1, stride*2);
3009     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3010     movdl(vec1, ch);
3011     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3012     vpxor(vec2, vec2);
3013     movl(tmp, cnt1);
3014     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3015     andl(cnt1,0x0000001F);  //tail count (in chars)
3016 
3017     bind(SCAN_TO_32_CHAR_LOOP);
3018     vmovdqu(vec3, Address(result, 0));
3019     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3020     vptest(vec2, vec3);
3021     jcc(Assembler::carryClear, FOUND_CHAR);
3022     addptr(result, 32);
3023     subl(tmp, stride*2);
3024     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3025     jmp(SCAN_TO_16_CHAR);
3026 
3027     bind(SCAN_TO_16_CHAR_INIT);
3028     movdl(vec1, ch);
3029     pxor(vec2, vec2);
3030     pshufb(vec1, vec2);
3031   }
3032 
3033   bind(SCAN_TO_16_CHAR);
3034   cmpl(cnt1, stride);
3035   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3036   if (UseAVX < 2) {
3037     movdl(vec1, ch);
3038     pxor(vec2, vec2);
3039     pshufb(vec1, vec2);
3040   }
3041   movl(tmp, cnt1);
3042   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3043   andl(cnt1,0x0000000F);  //tail count (in bytes)
3044 
3045   bind(SCAN_TO_16_CHAR_LOOP);
3046   movdqu(vec3, Address(result, 0));
3047   pcmpeqb(vec3, vec1);
3048   ptest(vec2, vec3);
3049   jcc(Assembler::carryClear, FOUND_CHAR);
3050   addptr(result, 16);
3051   subl(tmp, stride);
3052   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3053 
3054   bind(SCAN_TO_CHAR_INIT);
3055   testl(cnt1, cnt1);
3056   jcc(Assembler::zero, RET_NOT_FOUND);
3057   bind(SCAN_TO_CHAR_LOOP);
3058   load_unsigned_byte(tmp, Address(result, 0));
3059   cmpl(ch, tmp);
3060   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3061   addptr(result, 1);
3062   subl(cnt1, 1);
3063   jccb(Assembler::zero, RET_NOT_FOUND);
3064   jmp(SCAN_TO_CHAR_LOOP);
3065 
3066   bind(RET_NOT_FOUND);
3067   movl(result, -1);
3068   jmpb(DONE_LABEL);
3069 
3070   bind(FOUND_CHAR);
3071   if (UseAVX >= 2) {
3072     vpmovmskb(tmp, vec3);
3073   } else {
3074     pmovmskb(tmp, vec3);
3075   }
3076   bsfl(ch, tmp);
3077   addptr(result, ch);
3078 
3079   bind(FOUND_SEQ_CHAR);
3080   subptr(result, str1);
3081 
3082   bind(DONE_LABEL);
3083 } // stringL_indexof_char
3084 
3085 // helper function for string_compare
3086 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3087                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3088                                            Address::ScaleFactor scale2, Register index, int ae) {
3089   if (ae == StrIntrinsicNode::LL) {
3090     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3091     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3092   } else if (ae == StrIntrinsicNode::UU) {
3093     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3094     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3095   } else {
3096     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3097     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3098   }
3099 }
3100 
3101 // Compare strings, used for char[] and byte[].
3102 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3103                                        Register cnt1, Register cnt2, Register result,
3104                                        XMMRegister vec1, int ae, KRegister mask) {
3105   ShortBranchVerifier sbv(this);
3106   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3107   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3108   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3109   int stride2x2 = 0x40;
3110   Address::ScaleFactor scale = Address::no_scale;
3111   Address::ScaleFactor scale1 = Address::no_scale;
3112   Address::ScaleFactor scale2 = Address::no_scale;
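       // Java-level sketch of the result (illustrative): compare the first
       // min(cnt1, cnt2) elements; on the first mismatch return the difference of
       // the mismatching elements, otherwise return the length difference.
       //   static int compare(char[] a, int len1, char[] b, int len2) {
       //     int min = Math.min(len1, len2);
       //     for (int i = 0; i < min; i++) {
       //       if (a[i] != b[i]) return a[i] - b[i];
       //     }
       //     return len1 - len2;
       //   }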
3113 
3114   if (ae != StrIntrinsicNode::LL) {
3115     stride2x2 = 0x20;
3116   }
3117 
3118   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3119     shrl(cnt2, 1);
3120   }
3121   // Compute the minimum of the string lengths and push the
3122   // difference of the string lengths onto the stack;
3123   // select the minimum with a conditional move.
3124   movl(result, cnt1);
3125   subl(cnt1, cnt2);
3126   push(cnt1);
3127   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3128 
3129   // Is the minimum length zero?
3130   testl(cnt2, cnt2);
3131   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3132   if (ae == StrIntrinsicNode::LL) {
3133     // Load first bytes
3134     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3135     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3136   } else if (ae == StrIntrinsicNode::UU) {
3137     // Load first characters
3138     load_unsigned_short(result, Address(str1, 0));
3139     load_unsigned_short(cnt1, Address(str2, 0));
3140   } else {
3141     load_unsigned_byte(result, Address(str1, 0));
3142     load_unsigned_short(cnt1, Address(str2, 0));
3143   }
3144   subl(result, cnt1);
3145   jcc(Assembler::notZero,  POP_LABEL);
3146 
3147   if (ae == StrIntrinsicNode::UU) {
3148     // Divide length by 2 to get number of chars
3149     shrl(cnt2, 1);
3150   }
3151   cmpl(cnt2, 1);
3152   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3153 
3154   // Check if the strings start at the same location and setup scale and stride
3155   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3156     cmpptr(str1, str2);
3157     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3158     if (ae == StrIntrinsicNode::LL) {
3159       scale = Address::times_1;
3160       stride = 16;
3161     } else {
3162       scale = Address::times_2;
3163       stride = 8;
3164     }
3165   } else {
3166     scale1 = Address::times_1;
3167     scale2 = Address::times_2;
3168     // scale not used
3169     stride = 8;
3170   }
3171 
3172   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3173     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3174     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3175     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3176     Label COMPARE_TAIL_LONG;
3177     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3178 
3179     int pcmpmask = 0x19;
3180     if (ae == StrIntrinsicNode::LL) {
3181       pcmpmask &= ~0x01;
3182     }
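         // pcmpmask 0x19 = 0b011001: bits 1:0 = 01 (unsigned words),
         // bits 3:2 = 10 (equal each), bits 5:4 = 01 (negative polarity),
         // so rcx receives the index of the first mismatching element.
         // Clearing bit 0 (for LL) switches the element size to unsigned bytes.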
3183 
3184     // Setup to compare 16-chars (32-bytes) vectors,
3185     // start from first character again because it has aligned address.
3186     if (ae == StrIntrinsicNode::LL) {
3187       stride2 = 32;
3188     } else {
3189       stride2 = 16;
3190     }
3191     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3192       adr_stride = stride << scale;
3193     } else {
3194       adr_stride1 = 8;  //stride << scale1;
3195       adr_stride2 = 16; //stride << scale2;
3196     }
3197 
3198     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3199     // rax and rdx are used by pcmpestri as element counters
3200     movl(result, cnt2);
3201     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3202     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3203 
3204     // fast path : compare first 2 8-char vectors.
3205     bind(COMPARE_16_CHARS);
3206     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3207       movdqu(vec1, Address(str1, 0));
3208     } else {
3209       pmovzxbw(vec1, Address(str1, 0));
3210     }
3211     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3212     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3213 
3214     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3215       movdqu(vec1, Address(str1, adr_stride));
3216       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3217     } else {
3218       pmovzxbw(vec1, Address(str1, adr_stride1));
3219       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3220     }
3221     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3222     addl(cnt1, stride);
3223 
3224     // Compare the characters at index in cnt1
3225     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3226     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3227     subl(result, cnt2);
3228     jmp(POP_LABEL);
3229 
3230     // Setup the registers to start vector comparison loop
3231     bind(COMPARE_WIDE_VECTORS);
3232     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3233       lea(str1, Address(str1, result, scale));
3234       lea(str2, Address(str2, result, scale));
3235     } else {
3236       lea(str1, Address(str1, result, scale1));
3237       lea(str2, Address(str2, result, scale2));
3238     }
3239     subl(result, stride2);
3240     subl(cnt2, stride2);
3241     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3242     negptr(result);
3243 
3244     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3245     bind(COMPARE_WIDE_VECTORS_LOOP);
3246 
3247 #ifdef _LP64
3248     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3249       cmpl(cnt2, stride2x2);
3250       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3251       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3252       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3253 
3254       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3255       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3256         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3257         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise mask has some 0 bits
3258       } else {
3259         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3260         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise mask has some 0 bits
3261       }
3262       kortestql(mask, mask);
3263       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3264       addptr(result, stride2x2);  // update since we already compared at this addr
3265       subl(cnt2, stride2x2);      // and sub the size too
3266       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3267 
3268       vpxor(vec1, vec1);
3269       jmpb(COMPARE_WIDE_TAIL);
3270     }//if (VM_Version::supports_avx512vlbw())
3271 #endif // _LP64
3272 
3273 
3274     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3275     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3276       vmovdqu(vec1, Address(str1, result, scale));
3277       vpxor(vec1, Address(str2, result, scale));
3278     } else {
3279       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3280       vpxor(vec1, Address(str2, result, scale2));
3281     }
3282     vptest(vec1, vec1);
3283     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3284     addptr(result, stride2);
3285     subl(cnt2, stride2);
3286     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3287     // clean upper bits of YMM registers
3288     vpxor(vec1, vec1);
3289 
3290     // compare wide vectors tail
3291     bind(COMPARE_WIDE_TAIL);
3292     testptr(result, result);
3293     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3294 
3295     movl(result, stride2);
3296     movl(cnt2, result);
3297     negptr(result);
3298     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3299 
3300     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3301     bind(VECTOR_NOT_EQUAL);
3302     // clean upper bits of YMM registers
3303     vpxor(vec1, vec1);
3304     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3305       lea(str1, Address(str1, result, scale));
3306       lea(str2, Address(str2, result, scale));
3307     } else {
3308       lea(str1, Address(str1, result, scale1));
3309       lea(str2, Address(str2, result, scale2));
3310     }
3311     jmp(COMPARE_16_CHARS);
3312 
3313     // Compare tail chars, length between 1 and 15 chars
3314     bind(COMPARE_TAIL_LONG);
3315     movl(cnt2, result);
3316     cmpl(cnt2, stride);
3317     jcc(Assembler::less, COMPARE_SMALL_STR);
3318 
3319     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3320       movdqu(vec1, Address(str1, 0));
3321     } else {
3322       pmovzxbw(vec1, Address(str1, 0));
3323     }
3324     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3325     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3326     subptr(cnt2, stride);
3327     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3328     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3329       lea(str1, Address(str1, result, scale));
3330       lea(str2, Address(str2, result, scale));
3331     } else {
3332       lea(str1, Address(str1, result, scale1));
3333       lea(str2, Address(str2, result, scale2));
3334     }
3335     negptr(cnt2);
3336     jmpb(WHILE_HEAD_LABEL);
3337 
3338     bind(COMPARE_SMALL_STR);
3339   } else if (UseSSE42Intrinsics) {
3340     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3341     int pcmpmask = 0x19;
3342     // Setup to compare 8-char (16-byte) vectors,
3343     // start from first character again because it has aligned address.
3344     movl(result, cnt2);
3345     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3346     if (ae == StrIntrinsicNode::LL) {
3347       pcmpmask &= ~0x01;
3348     }
3349     jcc(Assembler::zero, COMPARE_TAIL);
3350     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3351       lea(str1, Address(str1, result, scale));
3352       lea(str2, Address(str2, result, scale));
3353     } else {
3354       lea(str1, Address(str1, result, scale1));
3355       lea(str2, Address(str2, result, scale2));
3356     }
3357     negptr(result);
3358 
3359     // pcmpestri
3360     //   inputs:
3361     //     vec1- substring
3362     //     rax - negative string length (elements count)
3363     //     mem - scanned string
3364     //     rdx - string length (elements count)
3365     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3366     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3367     //   outputs:
3368     //     rcx - first mismatched element index
3369     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3370 
3371     bind(COMPARE_WIDE_VECTORS);
3372     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3373       movdqu(vec1, Address(str1, result, scale));
3374       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3375     } else {
3376       pmovzxbw(vec1, Address(str1, result, scale1));
3377       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3378     }
3379     // After pcmpestri cnt1(rcx) contains mismatched element index
3380 
3381     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3382     addptr(result, stride);
3383     subptr(cnt2, stride);
3384     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3385 
3386     // compare wide vectors tail
3387     testptr(result, result);
3388     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3389 
3390     movl(cnt2, stride);
3391     movl(result, stride);
3392     negptr(result);
3393     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3394       movdqu(vec1, Address(str1, result, scale));
3395       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3396     } else {
3397       pmovzxbw(vec1, Address(str1, result, scale1));
3398       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3399     }
3400     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3401 
3402     // Mismatched characters in the vectors
3403     bind(VECTOR_NOT_EQUAL);
3404     addptr(cnt1, result);
3405     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3406     subl(result, cnt2);
3407     jmpb(POP_LABEL);
3408 
3409     bind(COMPARE_TAIL); // limit is zero
3410     movl(cnt2, result);
3411     // Fallthru to tail compare
3412   }
3413   // Shift str2 and str1 to the end of the arrays, negate min
3414   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3415     lea(str1, Address(str1, cnt2, scale));
3416     lea(str2, Address(str2, cnt2, scale));
3417   } else {
3418     lea(str1, Address(str1, cnt2, scale1));
3419     lea(str2, Address(str2, cnt2, scale2));
3420   }
3421   decrementl(cnt2);  // first character was compared already
3422   negptr(cnt2);
3423 
3424   // Compare the rest of the elements
3425   bind(WHILE_HEAD_LABEL);
3426   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3427   subl(result, cnt1);
3428   jccb(Assembler::notZero, POP_LABEL);
3429   increment(cnt2);
3430   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3431 
3432   // Strings are equal up to min length.  Return the length difference.
3433   bind(LENGTH_DIFF_LABEL);
3434   pop(result);
3435   if (ae == StrIntrinsicNode::UU) {
3436     // Divide diff by 2 to get number of chars
3437     sarl(result, 1);
3438   }
3439   jmpb(DONE_LABEL);
3440 
3441 #ifdef _LP64
3442   if (VM_Version::supports_avx512vlbw()) {
3443 
3444     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3445 
3446     kmovql(cnt1, mask);
3447     notq(cnt1);
3448     bsfq(cnt2, cnt1);
3449     if (ae != StrIntrinsicNode::LL) {
3450       // Divide diff by 2 to get number of chars
3451       sarl(cnt2, 1);
3452     }
3453     addq(result, cnt2);
3454     if (ae == StrIntrinsicNode::LL) {
3455       load_unsigned_byte(cnt1, Address(str2, result));
3456       load_unsigned_byte(result, Address(str1, result));
3457     } else if (ae == StrIntrinsicNode::UU) {
3458       load_unsigned_short(cnt1, Address(str2, result, scale));
3459       load_unsigned_short(result, Address(str1, result, scale));
3460     } else {
3461       load_unsigned_short(cnt1, Address(str2, result, scale2));
3462       load_unsigned_byte(result, Address(str1, result, scale1));
3463     }
3464     subl(result, cnt1);
3465     jmpb(POP_LABEL);
3466   }//if (VM_Version::supports_avx512vlbw())
3467 #endif // _LP64
3468 
3469   // Discard the stored length difference
3470   bind(POP_LABEL);
3471   pop(cnt1);
3472 
3473   // That's it
3474   bind(DONE_LABEL);
3475   if (ae == StrIntrinsicNode::UL) {
3476     negl(result);
3477   }
3478 
3479 }
3480 
3481 // Search for Non-ASCII character (Negative byte value) in a byte array,
3482 // return true if it has any and false otherwise.
3483 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3484 //   @IntrinsicCandidate
3485 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3486 //     for (int i = off; i < off + len; i++) {
3487 //       if (ba[i] < 0) {
3488 //         return true;
3489 //       }
3490 //     }
3491 //     return false;
3492 //   }
3493 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3494   Register result, Register tmp1,
3495   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3496   // rsi: byte array
3497   // rcx: len
3498   // rax: result
3499   ShortBranchVerifier sbv(this);
3500   assert_different_registers(ary1, len, result, tmp1);
3501   assert_different_registers(vec1, vec2);
3502   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3503 
3504   // len == 0
3505   testl(len, len);
3506   jcc(Assembler::zero, FALSE_LABEL);
3507 
3508   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3509     VM_Version::supports_avx512vlbw() &&
3510     VM_Version::supports_bmi2()) {
3511 
3512     Label test_64_loop, test_tail;
3513     Register tmp3_aliased = len;
3514 
3515     movl(tmp1, len);
3516     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3517 
3518     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3519     andl(len, ~(64 - 1));    // vector count (in chars)
3520     jccb(Assembler::zero, test_tail);
3521 
3522     lea(ary1, Address(ary1, len, Address::times_1));
3523     negptr(len);
3524 
3525     bind(test_64_loop);
3526     // Check whether any of these 64 byte elements is negative
3527     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3528     kortestql(mask1, mask1);
3529     jcc(Assembler::notZero, TRUE_LABEL);
3530 
3531     addptr(len, 64);
3532     jccb(Assembler::notZero, test_64_loop);
3533 
3534 
3535     bind(test_tail);
3536     // bail out when there is nothing to be done
3537     testl(tmp1, -1);
3538     jcc(Assembler::zero, FALSE_LABEL);
3539 
3540     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3541 #ifdef _LP64
3542     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3543     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3544     notq(tmp3_aliased);
3545     kmovql(mask2, tmp3_aliased);
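         // E.g. tmp1 == 5: ~(~0 << 5) == 0x1F, i.e. mask2 gets the 5 low bits set,
         // so the masked compare below only considers the first 5 tail bytes.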
3546 #else
3547     Label k_init;
3548     jmp(k_init);
3549 
3550     // We cannot read 64 bits at once from a general purpose register here,
3551     // so the data required to compose 64 1's is placed in the instruction stream.
3552     // We emit a 64-byte-wide series of elements 0..63 which is later used as the
3553     // compare target against the tail count held in the tmp1 register.
3554     // The result is a k register with tmp1 consecutive 1 bits, counting from
3555     // the least significant bit.
3556     address tmp = pc();
3557     emit_int64(0x0706050403020100);
3558     emit_int64(0x0F0E0D0C0B0A0908);
3559     emit_int64(0x1716151413121110);
3560     emit_int64(0x1F1E1D1C1B1A1918);
3561     emit_int64(0x2726252423222120);
3562     emit_int64(0x2F2E2D2C2B2A2928);
3563     emit_int64(0x3736353433323130);
3564     emit_int64(0x3F3E3D3C3B3A3938);
3565 
3566     bind(k_init);
3567     lea(len, InternalAddress(tmp));
3568     // create mask to test for negative byte inside a vector
3569     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3570     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3571 
3572 #endif
3573     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3574     ktestq(mask1, mask2);
3575     jcc(Assembler::notZero, TRUE_LABEL);
3576 
3577     jmp(FALSE_LABEL);
3578   } else {
3579     movl(result, len); // copy
3580 
3581     if (UseAVX >= 2 && UseSSE >= 2) {
3582       // With AVX2, use 32-byte vector compare
3583       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3584 
3585       // Compare 32-byte vectors
3586       andl(result, 0x0000001f);  //   tail count (in bytes)
3587       andl(len, 0xffffffe0);   // vector count (in bytes)
3588       jccb(Assembler::zero, COMPARE_TAIL);
3589 
3590       lea(ary1, Address(ary1, len, Address::times_1));
3591       negptr(len);
3592 
3593       movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in the vector
3594       movdl(vec2, tmp1);
3595       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
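           // vec2 now holds 0x80 in every byte lane; vptest(vec1, vec2) below sets
           // ZF only when no loaded byte has its sign bit set, i.e. no negatives.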
3596 
3597       bind(COMPARE_WIDE_VECTORS);
3598       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3599       vptest(vec1, vec2);
3600       jccb(Assembler::notZero, TRUE_LABEL);
3601       addptr(len, 32);
3602       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3603 
3604       testl(result, result);
3605       jccb(Assembler::zero, FALSE_LABEL);
3606 
3607       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3608       vptest(vec1, vec2);
3609       jccb(Assembler::notZero, TRUE_LABEL);
3610       jmpb(FALSE_LABEL);
3611 
3612       bind(COMPARE_TAIL); // len is zero
3613       movl(len, result);
3614       // Fallthru to tail compare
3615     } else if (UseSSE42Intrinsics) {
3616       // With SSE4.2, use double quad vector compare
3617       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3618 
3619       // Compare 16-byte vectors
3620       andl(result, 0x0000000f);  //   tail count (in bytes)
3621       andl(len, 0xfffffff0);   // vector count (in bytes)
3622       jcc(Assembler::zero, COMPARE_TAIL);
3623 
3624       lea(ary1, Address(ary1, len, Address::times_1));
3625       negptr(len);
3626 
3627       movl(tmp1, 0x80808080);
3628       movdl(vec2, tmp1);
3629       pshufd(vec2, vec2, 0);
3630 
3631       bind(COMPARE_WIDE_VECTORS);
3632       movdqu(vec1, Address(ary1, len, Address::times_1));
3633       ptest(vec1, vec2);
3634       jcc(Assembler::notZero, TRUE_LABEL);
3635       addptr(len, 16);
3636       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3637 
3638       testl(result, result);
3639       jcc(Assembler::zero, FALSE_LABEL);
3640 
3641       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3642       ptest(vec1, vec2);
3643       jccb(Assembler::notZero, TRUE_LABEL);
3644       jmpb(FALSE_LABEL);
3645 
3646       bind(COMPARE_TAIL); // len is zero
3647       movl(len, result);
3648       // Fallthru to tail compare
3649     }
3650   }
3651   // Compare 4-byte vectors
3652   andl(len, 0xfffffffc); // vector count (in bytes)
3653   jccb(Assembler::zero, COMPARE_CHAR);
3654 
3655   lea(ary1, Address(ary1, len, Address::times_1));
3656   negptr(len);
3657 
3658   bind(COMPARE_VECTORS);
3659   movl(tmp1, Address(ary1, len, Address::times_1));
3660   andl(tmp1, 0x80808080);
3661   jccb(Assembler::notZero, TRUE_LABEL);
3662   addptr(len, 4);
3663   jcc(Assembler::notZero, COMPARE_VECTORS);
3664 
3665   // Compare trailing char (final 2 bytes), if any
3666   bind(COMPARE_CHAR);
3667   testl(result, 0x2);   // tail  char
3668   jccb(Assembler::zero, COMPARE_BYTE);
3669   load_unsigned_short(tmp1, Address(ary1, 0));
3670   andl(tmp1, 0x00008080);
3671   jccb(Assembler::notZero, TRUE_LABEL);
3672   subptr(result, 2);
3673   lea(ary1, Address(ary1, 2));
3674 
3675   bind(COMPARE_BYTE);
3676   testl(result, 0x1);   // tail  byte
3677   jccb(Assembler::zero, FALSE_LABEL);
3678   load_unsigned_byte(tmp1, Address(ary1, 0));
3679   andl(tmp1, 0x00000080);
3680   jccb(Assembler::notEqual, TRUE_LABEL);
3681   jmpb(FALSE_LABEL);
3682 
3683   bind(TRUE_LABEL);
3684   movl(result, 1);   // return true
3685   jmpb(DONE);
3686 
3687   bind(FALSE_LABEL);
3688   xorl(result, result); // return false
3689 
3690   // That's it
3691   bind(DONE);
3692   if (UseAVX >= 2 && UseSSE >= 2) {
3693     // clean upper bits of YMM registers
3694     vpxor(vec1, vec1);
3695     vpxor(vec2, vec2);
3696   }
3697 }
3698 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
3699 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3700                                       Register limit, Register result, Register chr,
3701                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3702   ShortBranchVerifier sbv(this);
3703   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3704 
3705   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3706   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
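       // Java-level sketch of the result (illustrative): result = 1 if both ranges
       // of `limit` elements hold identical values, 0 otherwise.
       //   static boolean equals(byte[] a, byte[] b, int len) {
       //     for (int i = 0; i < len; i++) {
       //       if (a[i] != b[i]) return false;
       //     }
       //     return true;
       //   }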
3707 
3708   if (is_array_equ) {
3709     // Check the input args
3710     cmpoop(ary1, ary2);
3711     jcc(Assembler::equal, TRUE_LABEL);
3712 
3713     // Need additional checks for arrays_equals.
3714     testptr(ary1, ary1);
3715     jcc(Assembler::zero, FALSE_LABEL);
3716     testptr(ary2, ary2);
3717     jcc(Assembler::zero, FALSE_LABEL);
3718 
3719     // Check the lengths
3720     movl(limit, Address(ary1, length_offset));
3721     cmpl(limit, Address(ary2, length_offset));
3722     jcc(Assembler::notEqual, FALSE_LABEL);
3723   }
3724 
3725   // count == 0
3726   testl(limit, limit);
3727   jcc(Assembler::zero, TRUE_LABEL);
3728 
3729   if (is_array_equ) {
3730     // Load array address
3731     lea(ary1, Address(ary1, base_offset));
3732     lea(ary2, Address(ary2, base_offset));
3733   }
3734 
3735   if (is_array_equ && is_char) {
3736     // arrays_equals when used for char[].
3737     shll(limit, 1);      // byte count != 0
3738   }
3739   movl(result, limit); // copy
3740 
3741   if (UseAVX >= 2) {
3742     // With AVX2, use 32-byte vector compare
3743     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3744 
3745     // Compare 32-byte vectors
3746     andl(result, 0x0000001f);  //   tail count (in bytes)
3747     andl(limit, 0xffffffe0);   // vector count (in bytes)
3748     jcc(Assembler::zero, COMPARE_TAIL);
3749 
3750     lea(ary1, Address(ary1, limit, Address::times_1));
3751     lea(ary2, Address(ary2, limit, Address::times_1));
3752     negptr(limit);
3753 
3754 #ifdef _LP64
3755     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3756       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3757 
3758       cmpl(limit, -64);
3759       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3760 
3761       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3762 
3763       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3764       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3765       kortestql(mask, mask);
3766       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3767       addptr(limit, 64);  // update since we already compared at this addr
3768       cmpl(limit, -64);
3769       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3770 
3771       // At this point we may still need to compare -limit+result bytes.
3772       // We could execute the next two instructions and just continue via the non-wide path:
3773       //  cmpl(limit, 0);
3774       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3775       // But since we stopped at the points ary{1,2}+limit which are
3776       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3777       // (|limit| <= 32 and result < 32),
3778       // we may just compare the last 64 bytes.
3779       //
3780       addptr(result, -64);   // it is safe, because we just came from this area
3781       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3782       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3783       kortestql(mask, mask);
3784       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3785 
3786       jmp(TRUE_LABEL);
3787 
3788       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3789 
3790     }//if (VM_Version::supports_avx512vlbw())
3791 #endif //_LP64
3792     bind(COMPARE_WIDE_VECTORS);
3793     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3794     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3795     vpxor(vec1, vec2);
3796 
3797     vptest(vec1, vec1);
3798     jcc(Assembler::notZero, FALSE_LABEL);
3799     addptr(limit, 32);
3800     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3801 
3802     testl(result, result);
3803     jcc(Assembler::zero, TRUE_LABEL);
3804 
3805     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3806     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3807     vpxor(vec1, vec2);
3808 
3809     vptest(vec1, vec1);
3810     jccb(Assembler::notZero, FALSE_LABEL);
3811     jmpb(TRUE_LABEL);
3812 
3813     bind(COMPARE_TAIL); // limit is zero
3814     movl(limit, result);
3815     // Fallthru to tail compare
3816   } else if (UseSSE42Intrinsics) {
3817     // With SSE4.2, use double quad vector compare
3818     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3819 
3820     // Compare 16-byte vectors
3821     andl(result, 0x0000000f);  //   tail count (in bytes)
3822     andl(limit, 0xfffffff0);   // vector count (in bytes)
3823     jcc(Assembler::zero, COMPARE_TAIL);
3824 
3825     lea(ary1, Address(ary1, limit, Address::times_1));
3826     lea(ary2, Address(ary2, limit, Address::times_1));
3827     negptr(limit);
3828 
3829     bind(COMPARE_WIDE_VECTORS);
3830     movdqu(vec1, Address(ary1, limit, Address::times_1));
3831     movdqu(vec2, Address(ary2, limit, Address::times_1));
3832     pxor(vec1, vec2);
3833 
3834     ptest(vec1, vec1);
3835     jcc(Assembler::notZero, FALSE_LABEL);
3836     addptr(limit, 16);
3837     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3838 
3839     testl(result, result);
3840     jcc(Assembler::zero, TRUE_LABEL);
3841 
3842     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3843     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3844     pxor(vec1, vec2);
3845 
3846     ptest(vec1, vec1);
3847     jccb(Assembler::notZero, FALSE_LABEL);
3848     jmpb(TRUE_LABEL);
3849 
3850     bind(COMPARE_TAIL); // limit is zero
3851     movl(limit, result);
3852     // Fallthru to tail compare
3853   }
3854 
3855   // Compare 4-byte vectors
3856   andl(limit, 0xfffffffc); // vector count (in bytes)
3857   jccb(Assembler::zero, COMPARE_CHAR);
3858 
3859   lea(ary1, Address(ary1, limit, Address::times_1));
3860   lea(ary2, Address(ary2, limit, Address::times_1));
3861   negptr(limit);
3862 
3863   bind(COMPARE_VECTORS);
3864   movl(chr, Address(ary1, limit, Address::times_1));
3865   cmpl(chr, Address(ary2, limit, Address::times_1));
3866   jccb(Assembler::notEqual, FALSE_LABEL);
3867   addptr(limit, 4);
3868   jcc(Assembler::notZero, COMPARE_VECTORS);
3869 
3870   // Compare trailing char (final 2 bytes), if any
3871   bind(COMPARE_CHAR);
3872   testl(result, 0x2);   // tail  char
3873   jccb(Assembler::zero, COMPARE_BYTE);
3874   load_unsigned_short(chr, Address(ary1, 0));
3875   load_unsigned_short(limit, Address(ary2, 0));
3876   cmpl(chr, limit);
3877   jccb(Assembler::notEqual, FALSE_LABEL);
3878 
3879   if (is_array_equ && is_char) {
3880     bind(COMPARE_BYTE);
3881   } else {
3882     lea(ary1, Address(ary1, 2));
3883     lea(ary2, Address(ary2, 2));
3884 
3885     bind(COMPARE_BYTE);
3886     testl(result, 0x1);   // tail  byte
3887     jccb(Assembler::zero, TRUE_LABEL);
3888     load_unsigned_byte(chr, Address(ary1, 0));
3889     load_unsigned_byte(limit, Address(ary2, 0));
3890     cmpl(chr, limit);
3891     jccb(Assembler::notEqual, FALSE_LABEL);
3892   }
3893   bind(TRUE_LABEL);
3894   movl(result, 1);   // return true
3895   jmpb(DONE);
3896 
3897   bind(FALSE_LABEL);
3898   xorl(result, result); // return false
3899 
3900   // That's it
3901   bind(DONE);
3902   if (UseAVX >= 2) {
3903     // clean upper bits of YMM registers
3904     vpxor(vec1, vec1);
3905     vpxor(vec2, vec2);
3906   }
3907 }
3908 
3909 #ifdef _LP64
3910 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3911                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3912   assert(VM_Version::supports_avx512vlbw(), "");
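       // Sketch of the approach (mask lanes are assumed to hold 0/1 per byte):
       // negating (0 - lane) turns true lanes into 0xFF so their sign bits can be
       // collected into ktmp by evpmovb2m; the bitmask copied into tmp is then
       // reduced with popcnt/bsr/bsf to yield trueCount/lastTrue/firstTrue.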
3913   vpxor(xtmp, xtmp, xtmp, vec_enc);
3914   vpsubb(xtmp, xtmp, mask, vec_enc);
3915   evpmovb2m(ktmp, xtmp, vec_enc);
3916   kmovql(tmp, ktmp);
3917   switch(opc) {
3918     case Op_VectorMaskTrueCount:
3919       popcntq(dst, tmp);
3920       break;
3921     case Op_VectorMaskLastTrue:
3922       mov64(dst, -1);
3923       bsrq(tmp, tmp);
3924       cmov(Assembler::notZero, dst, tmp);
3925       break;
3926     case Op_VectorMaskFirstTrue:
3927       mov64(dst, masklen);
3928       bsfq(tmp, tmp);
3929       cmov(Assembler::notZero, dst, tmp);
3930       break;
3931     default: assert(false, "Unhandled mask operation");
3932   }
3933 }
3934 
3935 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3936                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3937   assert(VM_Version::supports_avx(), "");
3938   vpxor(xtmp, xtmp, xtmp, vec_enc);
3939   vpsubb(xtmp, xtmp, mask, vec_enc);
3940   vpmovmskb(tmp, xtmp, vec_enc);
3941   if (masklen < 64) {
3942     andq(tmp, (((jlong)1 << masklen) - 1));
3943   }
3944   switch(opc) {
3945     case Op_VectorMaskTrueCount:
3946       popcntq(dst, tmp);
3947       break;
3948     case Op_VectorMaskLastTrue:
3949       mov64(dst, -1);
3950       bsrq(tmp, tmp);
3951       cmov(Assembler::notZero, dst, tmp);
3952       break;
3953     case Op_VectorMaskFirstTrue:
3954       mov64(dst, masklen);
3955       bsfq(tmp, tmp);
3956       cmov(Assembler::notZero, dst, tmp);
3957       break;
3958     default: assert(false, "Unhandled mask operation");
3959   }
3960 }
3961 #endif
3962 
3963 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
3964                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
3965                                         int vlen_enc) {
3966   assert(VM_Version::supports_avx512bw(), "");
3967   // Byte shuffles are in-lane operations and indices are determined using
3968   // the lower 4 bits of each shuffle lane, thus all shuffle indices are
3969   // normalized to the index range 0-15. This ensures that indices which are
3970   // congruent modulo 16 select the same relative position within a 128-bit
3971   // lane, e.g. shuffle indices 16, 32 and 48 all select element 0 of
3972   // their respective 128-bit lanes.
3973   movl(rtmp, 16);
3974   evpbroadcastb(xtmp1, rtmp, vlen_enc);
3975 
3976   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
3977   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
3978   // original shuffle indices and move the shuffled lanes corresponding to true
3979   // mask to destination vector.
3980   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
3981   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
3982   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
3983 
3984   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
3985   // and broadcasting second 128 bit lane.
3986   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3987   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
3988   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
3989   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
3990   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3991 
3992   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
3993   // and broadcasting third 128 bit lane.
3994   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
3995   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
3996   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
3997   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
3998   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3999 
4000   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
4001   // and broadcasting the fourth 128 bit lane.
4002   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
4003   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
4004   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
4005   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
4006   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4007 }