1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_CodeStubs.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/opcodes.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
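     // Map a vector length in bytes to the AVX encoding expected by the assembler.
     // Sub-128-bit vectors (4 and 8 bytes) are emitted with the 128-bit (XMM)
     // encoding; 32 and 64 bytes select the 256-bit (YMM) and 512-bit (ZMM) encodings.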
  39 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  40   switch (vlen_in_bytes) {
  41     case  4: // fall-through
  42     case  8: // fall-through
  43     case 16: return Assembler::AVX_128bit;
  44     case 32: return Assembler::AVX_256bit;
  45     case 64: return Assembler::AVX_512bit;
  46 
  47     default: {
  48       ShouldNotReachHere();
  49       return Assembler::AVX_NoVec;
  50     }
  51   }
  52 }
  53 
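     // setvectmask prepares a post-loop vector mask: dst = (1 << src) - 1 sets the
     // low 'src' bits, kmovdl copies that pattern into the opmask register, and dst
     // is then reloaded from src. restorevectmask resets the mask register to the
     // complement of k0, i.e. all lanes enabled when k0 holds zero.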
  54 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  55   guarantee(PostLoopMultiversioning, "must be");
  56   Assembler::movl(dst, 1);
  57   Assembler::shlxl(dst, dst, src);
  58   Assembler::decl(dst);
  59   Assembler::kmovdl(mask, dst);
  60   Assembler::movl(dst, src);
  61 }
  62 
  63 void C2_MacroAssembler::restorevectmask(KRegister mask) {
  64   guarantee(PostLoopMultiversioning, "must be");
  65   Assembler::knotwl(mask, k0);
  66 }
  67 
  68 #if INCLUDE_RTM_OPT
  69 
  70 // Update rtm_counters based on abort status
  71 // input: abort_status
  72 //        rtm_counters (RTMLockingCounters*)
  73 // flags are killed
  74 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  75 
  76   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  77   if (PrintPreciseRTMLockingStatistics) {
  78     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  79       Label check_abort;
  80       testl(abort_status, (1<<i));
  81       jccb(Assembler::equal, check_abort);
  82       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  83       bind(check_abort);
  84     }
  85   }
  86 }
  87 
  88 // Branch if ((random & (count-1)) != 0); count must be a power of two (2^n)
  89 // tmp, scr and flags are killed
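     // Illustrative example (not from the original comment): with count == 64 the low
     // TSC bits are effectively random, so the branch is taken about 63 times out of
     // 64 and the fall-through path (e.g. a statistics increment) runs roughly once
     // per 64 calls.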
  90 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  91   assert(tmp == rax, "");
  92   assert(scr == rdx, "");
  93   rdtsc(); // modifies EDX:EAX
  94   andptr(tmp, count-1);
  95   jccb(Assembler::notZero, brLabel);
  96 }
  97 
  98 // Perform abort ratio calculation, set no_rtm bit if high ratio
  99 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 100 // tmpReg, rtm_counters_Reg and flags are killed
 101 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 102                                                     Register rtm_counters_Reg,
 103                                                     RTMLockingCounters* rtm_counters,
 104                                                     Metadata* method_data) {
 105   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 106 
 107   if (RTMLockingCalculationDelay > 0) {
 108     // Delay calculation
 109     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 110     testptr(tmpReg, tmpReg);
 111     jccb(Assembler::equal, L_done);
 112   }
 113   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 114   //   Aborted transactions = abort_count * 100
 115   //   All transactions = total_count *  RTMTotalCountIncrRate
 116   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
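       //   Illustrative example (not from the original comment): with RTMAbortRatio == 50
       //   and RTMTotalCountIncrRate == 1, the no_rtm bit is set once
       //   abort_count * 100 >= total_count * 50, i.e. once at least half of the
       //   observed transactions have aborted.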
 117 
 118   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 119   cmpptr(tmpReg, RTMAbortThreshold);
 120   jccb(Assembler::below, L_check_always_rtm2);
 121   imulptr(tmpReg, tmpReg, 100);
 122 
 123   Register scrReg = rtm_counters_Reg;
 124   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 125   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 126   imulptr(scrReg, scrReg, RTMAbortRatio);
 127   cmpptr(tmpReg, scrReg);
 128   jccb(Assembler::below, L_check_always_rtm1);
 129   if (method_data != NULL) {
 130     // set rtm_state to "no rtm" in MDO
 131     mov_metadata(tmpReg, method_data);
 132     lock();
 133     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 134   }
 135   jmpb(L_done);
 136   bind(L_check_always_rtm1);
 137   // Reload RTMLockingCounters* address
 138   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 139   bind(L_check_always_rtm2);
 140   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 141   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 142   jccb(Assembler::below, L_done);
 143   if (method_data != NULL) {
 144     // set rtm_state to "always rtm" in MDO
 145     mov_metadata(tmpReg, method_data);
 146     lock();
 147     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 148   }
 149   bind(L_done);
 150 }
 151 
 152 // Update counters and perform abort ratio calculation
 153 // input:  abort_status_Reg
 154 // rtm_counters_Reg, flags are killed
 155 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 156                                       Register rtm_counters_Reg,
 157                                       RTMLockingCounters* rtm_counters,
 158                                       Metadata* method_data,
 159                                       bool profile_rtm) {
 160 
 161   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 162   // update rtm counters based on rax value at abort
 163   // reads abort_status_Reg, updates flags
 164   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 165   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 166   if (profile_rtm) {
 167     // Save abort status because abort_status_Reg is used by following code.
 168     if (RTMRetryCount > 0) {
 169       push(abort_status_Reg);
 170     }
 171     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 172     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 173     // restore abort status
 174     if (RTMRetryCount > 0) {
 175       pop(abort_status_Reg);
 176     }
 177   }
 178 }
 179 
 180 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 181 // inputs: retry_count_Reg
 182 //       : abort_status_Reg
 183 // output: retry_count_Reg decremented by 1
 184 // flags are killed
 185 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 186   Label doneRetry;
 187   assert(abort_status_Reg == rax, "");
 188   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 189   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 190   // if reason is in 0x6 and retry count != 0 then retry
 191   andptr(abort_status_Reg, 0x6);
 192   jccb(Assembler::zero, doneRetry);
 193   testl(retry_count_Reg, retry_count_Reg);
 194   jccb(Assembler::zero, doneRetry);
 195   pause();
 196   decrementl(retry_count_Reg);
 197   jmp(retryLabel);
 198   bind(doneRetry);
 199 }
 200 
 201 // Spin and retry if lock is busy,
 202 // inputs: box_Reg (monitor address)
 203 //       : retry_count_Reg
 204 // output: retry_count_Reg decremented by 1
 205 //       : clear z flag if retry count exceeded
 206 // tmp_Reg, scr_Reg, flags are killed
 207 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 208                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 209   Label SpinLoop, SpinExit, doneRetry;
 210   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 211 
 212   testl(retry_count_Reg, retry_count_Reg);
 213   jccb(Assembler::zero, doneRetry);
 214   decrementl(retry_count_Reg);
 215   movptr(scr_Reg, RTMSpinLoopCount);
 216 
 217   bind(SpinLoop);
 218   pause();
 219   decrementl(scr_Reg);
 220   jccb(Assembler::lessEqual, SpinExit);
 221   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 222   testptr(tmp_Reg, tmp_Reg);
 223   jccb(Assembler::notZero, SpinLoop);
 224 
 225   bind(SpinExit);
 226   jmp(retryLabel);
 227   bind(doneRetry);
 228   incrementl(retry_count_Reg); // clear z flag
 229 }
 230 
 231 // Use RTM for normal stack locks
 232 // Input: objReg (object to lock)
 233 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 234                                          Register retry_on_abort_count_Reg,
 235                                          RTMLockingCounters* stack_rtm_counters,
 236                                          Metadata* method_data, bool profile_rtm,
 237                                          Label& DONE_LABEL, Label& IsInflated) {
 238   assert(UseRTMForStackLocks, "why call this otherwise?");
 239   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 240   assert(tmpReg == rax, "");
 241   assert(scrReg == rdx, "");
 242   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 243 
 244   if (RTMRetryCount > 0) {
 245     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 246     bind(L_rtm_retry);
 247   }
 248   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 249   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 250   jcc(Assembler::notZero, IsInflated);
 251 
 252   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 253     Label L_noincrement;
 254     if (RTMTotalCountIncrRate > 1) {
 255       // tmpReg, scrReg and flags are killed
 256       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 257     }
 258     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 259     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 260     bind(L_noincrement);
 261   }
 262   xbegin(L_on_abort);
 263   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 264   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 265   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 266   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 267 
 268   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 269   if (UseRTMXendForLockBusy) {
 270     xend();
 271     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 272     jmp(L_decrement_retry);
 273   }
 274   else {
 275     xabort(0);
 276   }
 277   bind(L_on_abort);
 278   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 279     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 280   }
 281   bind(L_decrement_retry);
 282   if (RTMRetryCount > 0) {
 283     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 284     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 285   }
 286 }
 287 
 288 // Use RTM for inflated locks
 289 // inputs: objReg (object to lock)
 290 //         boxReg (on-stack box address (displaced header location) - KILLED)
 291 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 292 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 293                                             Register scrReg, Register retry_on_busy_count_Reg,
 294                                             Register retry_on_abort_count_Reg,
 295                                             RTMLockingCounters* rtm_counters,
 296                                             Metadata* method_data, bool profile_rtm,
 297                                             Label& DONE_LABEL) {
 298   assert(UseRTMLocking, "why call this otherwise?");
 299   assert(tmpReg == rax, "");
 300   assert(scrReg == rdx, "");
 301   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 302   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 303 
 304   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 305   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 306   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 307 
 308   if (RTMRetryCount > 0) {
 309     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 310     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 311     bind(L_rtm_retry);
 312   }
 313   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 314     Label L_noincrement;
 315     if (RTMTotalCountIncrRate > 1) {
 316       // tmpReg, scrReg and flags are killed
 317       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 318     }
 319     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 320     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 321     bind(L_noincrement);
 322   }
 323   xbegin(L_on_abort);
 324   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 325   movptr(tmpReg, Address(tmpReg, owner_offset));
 326   testptr(tmpReg, tmpReg);
 327   jcc(Assembler::zero, DONE_LABEL);
 328   if (UseRTMXendForLockBusy) {
 329     xend();
 330     jmp(L_decrement_retry);
 331   }
 332   else {
 333     xabort(0);
 334   }
 335   bind(L_on_abort);
 336   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 337   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 338     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 339   }
 340   if (RTMRetryCount > 0) {
 341     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 342     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 343   }
 344 
 345   movptr(tmpReg, Address(boxReg, owner_offset));
 346   testptr(tmpReg, tmpReg);
 347   jccb(Assembler::notZero, L_decrement_retry);
 348 
 349   // Appears unlocked - try to swing _owner from null to non-null.
 350   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 351 #ifdef _LP64
 352   Register threadReg = r15_thread;
 353 #else
 354   get_thread(scrReg);
 355   Register threadReg = scrReg;
 356 #endif
 357   lock();
 358   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 359 
 360   if (RTMRetryCount > 0) {
 361     // if the CAS succeeded we are done, else retry
 362     jccb(Assembler::equal, DONE_LABEL);
 363     bind(L_decrement_retry);
 364     // Spin and retry if lock is busy.
 365     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 366   }
 367   else {
 368     bind(L_decrement_retry);
 369   }
 370 }
 371 
 372 #endif //  INCLUDE_RTM_OPT
 373 
 374 // fast_lock and fast_unlock used by C2
 375 
 376 // Because the transitions from emitted code to the runtime
 377 // monitorenter/exit helper stubs are so slow it's critical that
 378 // we inline both the stack-locking fast path and the inflated fast path.
 379 //
 380 // See also: cmpFastLock and cmpFastUnlock.
 381 //
 382 // What follows is a specialized inline transliteration of the code
 383 // in enter() and exit(). If we're concerned about I$ bloat another
 384 // option would be to emit TrySlowEnter and TrySlowExit methods
 385 // at startup-time.  These methods would accept arguments as
 386 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 387 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 388 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 389 // In practice, however, the # of lock sites is bounded and is usually small.
 390 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 391 // if the processor uses simple bimodal branch predictors keyed by EIP,
 392 // since the helper routines would be called from multiple synchronization
 393 // sites.
 394 //
 395 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 396 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 397 // to those specialized methods.  That'd give us a mostly platform-independent
 398 // implementation that the JITs could optimize and inline at their pleasure.
 399 // Done correctly, the only time we'd need to cross to native code would be
 400 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 401 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 402 // (b) provide explicit barriers or fence operations.
 403 //
 404 // TODO:
 405 //
 406 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 407 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 408 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 409 //    the lock operators would typically be faster than reifying Self.
 410 //
 411 // *  Ideally I'd define the primitives as:
 412 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 413 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 414 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 415 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 416 //    Furthermore, the register assignments are overconstrained, possibly resulting in
 417 //    sub-optimal code near the synchronization site.
 418 //
 419 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 420 //    Alternately, use a better sp-proximity test.
 421 //
 422 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 423 //    Either one is sufficient to uniquely identify a thread.
 424 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 425 //
 426 // *  Intrinsify notify() and notifyAll() for the common cases where the
 427 //    object is locked by the calling thread but the waitlist is empty.
 428 //    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 429 //
 430 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 431 //    But beware of excessive branch density on AMD Opterons.
 432 //
 433 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 434 //    or failure of the fast path.  If the fast path fails then we pass
 435 //    control to the slow path, typically in C.  In fast_lock and
 436 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 437 //    will emit a conditional branch immediately after the node.
 438 //    So we have branches to branches and lots of ICC.ZF games.
 439 //    Instead, it might be better to have C2 pass a "FailureLabel"
 440 //    into fast_lock and fast_unlock.  In the case of success, control
 441 //    will drop through the node.  ICC.ZF is undefined at exit.
 442 //    In the case of failure, the node will branch directly to the
 443 //    FailureLabel.
 444 
 445 
 446 // obj: object to lock
 447 // box: on-stack box address (displaced header location) - KILLED
 448 // rax: tmp -- KILLED
 449 // scr: tmp -- KILLED
 450 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 451                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 452                                  BiasedLockingCounters* counters,
 453                                  RTMLockingCounters* rtm_counters,
 454                                  RTMLockingCounters* stack_rtm_counters,
 455                                  Metadata* method_data,
 456                                  bool use_rtm, bool profile_rtm) {
 457   // Ensure the register assignments are disjoint
 458   assert(tmpReg == rax, "");
 459 
 460   if (use_rtm) {
 461     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 462   } else {
 463     assert(cx2Reg == noreg, "");
 464     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 465   }
 466 
 467   if (counters != NULL) {
 468     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 469   }
 470 
 471   // Possible cases that we'll encounter in fast_lock
 472   // ------------------------------------------------
 473   // * Inflated
 474   //    -- unlocked
 475   //    -- Locked
 476   //       = by self
 477   //       = by other
 478   // * biased
 479   //    -- by Self
 480   //    -- by other
 481   // * neutral
 482   // * stack-locked
 483   //    -- by self
 484   //       = sp-proximity test hits
 485   //       = sp-proximity test generates false-negative
 486   //    -- by other
 487   //
 488 
 489   Label IsInflated, DONE_LABEL;
 490 
 491   if (DiagnoseSyncOnValueBasedClasses != 0) {
 492     load_klass(tmpReg, objReg, cx1Reg);
 493     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 494     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 495     jcc(Assembler::notZero, DONE_LABEL);
 496   }
 497 
 498   // it's stack-locked, biased or neutral
 499   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 500   // order to reduce the number of conditional branches in the most common cases.
 501   // Beware -- there's a subtle invariant that fetch of the markword
 502   // at [FETCH], below, will never observe a biased encoding (*101b).
 503   // If this invariant is not held we risk exclusion (safety) failure.
 504   if (UseBiasedLocking && !UseOptoBiasInlining) {
 505     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 506   }
 507 
 508 #if INCLUDE_RTM_OPT
 509   if (UseRTMForStackLocks && use_rtm) {
 510     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 511                       stack_rtm_counters, method_data, profile_rtm,
 512                       DONE_LABEL, IsInflated);
 513   }
 514 #endif // INCLUDE_RTM_OPT
 515 
 516   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 517   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 518   jccb(Assembler::notZero, IsInflated);
 519 
 520   if (UseFastLocking) {
 521 #ifdef _LP64
 522     fast_lock_impl(objReg, tmpReg, thread, scrReg, DONE_LABEL, false);
 523     xorl(tmpReg, tmpReg); // Set ZF=1 to indicate success
 524 #else
 525     // We cannot emit the lock-stack-check in verified_entry() because we don't have enough
 526     // registers (for thread ptr). Therefore we have to emit the lock-stack-check in
 527     // fast_lock_impl(). However, that check can take a slow-path with ZF=1, therefore
 528     // we need to handle it specially and force ZF=0 before taking the actual slow-path.
 529     Label slow;
 530     fast_lock_impl(objReg, tmpReg, thread, scrReg, slow);
 531     xorl(tmpReg, tmpReg);
 532     jmp(DONE_LABEL);
 533     bind(slow);
 534     testptr(objReg, objReg); // ZF=0 to indicate failure
 535 #endif
 536   } else {
 537     // Attempt stack-locking ...
 538     orptr (tmpReg, markWord::unlocked_value);
 539     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 540     lock();
 541     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 542     if (counters != NULL) {
 543       cond_inc32(Assembler::equal,
 544                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 545     }
 546     jcc(Assembler::equal, DONE_LABEL);           // Success
 547 
 548     // Recursive locking.
 549     // The object is stack-locked: markword contains stack pointer to BasicLock.
 550     // Locked by current thread if difference with current SP is less than one page.
 551     subptr(tmpReg, rsp);
 552     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 553     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
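         // Illustrative arithmetic assuming a 4 KiB page (an example value, not stated
         // here): the LP64 mask is 7 - 4096 = 0xFFFFF007 sign-extended, so the AND above
         // yields zero only when mark - rsp is non-negative, smaller than one page, and
         // has its low lock bits clear -- i.e. the markword plausibly points into this frame.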
 554     movptr(Address(boxReg, 0), tmpReg);
 555     if (counters != NULL) {
 556       cond_inc32(Assembler::equal,
 557                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 558     }
 559   }
 560   jmp(DONE_LABEL);
 561 
 562   bind(IsInflated);
 563   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 564 
 565 #if INCLUDE_RTM_OPT
 566   // Use the same RTM locking code in 32- and 64-bit VM.
 567   if (use_rtm) {
 568     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 569                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 570   } else {
 571 #endif // INCLUDE_RTM_OPT
 572 
 573 #ifndef _LP64
 574   // The object is inflated.
 575 
 576   // boxReg refers to the on-stack BasicLock in the current frame.
 577   // We'd like to write:
 578   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 579   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 580   // additional latency as we have another ST in the store buffer that must drain.
 581 
 582   // avoid ST-before-CAS
 583   // register juggle because we need tmpReg for cmpxchgptr below
 584   movptr(scrReg, boxReg);
 585   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 586 
 587   // Optimistic form: consider XORL tmpReg,tmpReg
 588   movptr(tmpReg, NULL_WORD);
 589 
 590   // Appears unlocked - try to swing _owner from null to non-null.
 591   // Ideally, I'd manifest "Self" with get_thread and then attempt
 592   // to CAS the register containing Self into m->Owner.
 593   // But we don't have enough registers, so instead we can either try to CAS
 594   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 595   // we later store "Self" into m->Owner.  Transiently storing a stack address
 596   // (rsp or the address of the box) into  m->owner is harmless.
 597   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 598   lock();
 599   cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 600   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 601   // If the CAS fails we can either retry or pass control to the slow path.
 602   // We use the latter tactic.
 603   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 604   // If the CAS was successful ...
 605   //   Self has acquired the lock
 606   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 607   // Intentional fall-through into DONE_LABEL ...
 608 #else // _LP64
 609   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 610   movq(scrReg, tmpReg);
 611   xorq(tmpReg, tmpReg);
 612   lock();
 613   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 614   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 615   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 616   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 617   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 618   jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)
 619 
 620   cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
 621   jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
 622   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 623   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 624 #endif // _LP64
 625 #if INCLUDE_RTM_OPT
 626   } // use_rtm()
 627 #endif
 628   // DONE_LABEL is a hot target - we'd really like to place it at the
 629   // start of cache line by padding with NOPs.
 630   // See the AMD and Intel software optimization manuals for the
 631   // most efficient "long" NOP encodings.
 632   // Unfortunately none of our alignment mechanisms suffice.
 633   bind(DONE_LABEL);
 634 
 635   // At DONE_LABEL the icc ZFlag is set as follows ...
 636   // fast_unlock uses the same protocol.
 637   // ZFlag == 1 -> Success
 638   // ZFlag == 0 -> Failure - force control through the slow path
 639 }
 640 
 641 // obj: object to unlock
 642 // box: box address (displaced header location), killed.  Must be EAX.
 643 // tmp: killed, cannot be obj nor box.
 644 //
 645 // Some commentary on balanced locking:
 646 //
 647 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 648 // Methods that don't have provably balanced locking are forced to run in the
 649 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 650 // The interpreter provides two properties:
 651 // I1:  At return-time the interpreter automatically and quietly unlocks any
 652 //      objects acquired by the current activation (frame).  Recall that the
 653 //      interpreter maintains an on-stack list of locks currently held by
 654 //      a frame.
 655 // I2:  If a method attempts to unlock an object that is not held by
 656 //      the frame, the interpreter throws IMSX.
 657 //
 658 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 659 // B() doesn't have provably balanced locking so it runs in the interpreter.
 660 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 661 // is still locked by A().
 662 //
 663 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 664 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 665 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 666 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 667 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 668 // could reasonably *avoid* checking the owner in fast_unlock().
 669 // In the interest of performance we elide the m->Owner==Self check in unlock.
 670 // A perfectly viable alternative is to elide the owner check except when
 671 // Xcheck:jni is enabled.
 672 
 673 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 674   assert(boxReg == rax, "");
 675   assert_different_registers(objReg, boxReg, tmpReg);
 676 
 677   Label DONE_LABEL, Stacked, CheckSucc;
 678 
 679   // Critically, the biased locking test must have precedence over
 680   // and appear before the (box->dhw == 0) recursive stack-lock test.
 681   if (UseBiasedLocking && !UseOptoBiasInlining) {
 682     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 683   }
 684 
 685 #if INCLUDE_RTM_OPT
 686   if (UseRTMForStackLocks && use_rtm) {
 687     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 688     Label L_regular_unlock;
 689     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 690     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 691     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 692     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 693     xend();                                                           // otherwise end...
 694     jmp(DONE_LABEL);                                                  // ... and we're done
 695     bind(L_regular_unlock);
 696   }
 697 #endif
 698 
 699   if (!UseFastLocking) {
 700     cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 701     jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 702   }
 703   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 704   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 705   jcc(Assembler::zero, Stacked);
 706 
 707   if (UseFastLocking) {
 708     // If the owner is ANONYMOUS, we need to fix it.
 709     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int) (intptr_t) ANONYMOUS_OWNER);
 710 #ifdef _LP64
 711     C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg);
 712     Compile::current()->output()->add_stub(stub);
 713     jcc(Assembler::notEqual, stub->entry());
 714     bind(stub->continuation());
 715 #else
 716     // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 717     // Call the slow-path instead.
 718     jcc(Assembler::notEqual, DONE_LABEL);
 719 #endif
 720   }
 721 
 722   // It's inflated.
 723 #if INCLUDE_RTM_OPT
 724   if (use_rtm) {
 725     Label L_regular_inflated_unlock;
 726     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 727     movptr(boxReg, Address(tmpReg, owner_offset));
 728     testptr(boxReg, boxReg);
 729     jccb(Assembler::notZero, L_regular_inflated_unlock);
 730     xend();
 731     jmpb(DONE_LABEL);
 732     bind(L_regular_inflated_unlock);
 733   }
 734 #endif
 735 
 736   // Despite our balanced locking property we still check that m->_owner == Self
 737   // as java routines or native JNI code called by this thread might
 738   // have released the lock.
 739   // Refer to the comments in synchronizer.cpp for how we might encode extra
 740   // state in _succ so we can avoid fetching EntryList|cxq.
 741   //
 742   // If there's no contention try a 1-0 exit.  That is, exit without
 743   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 744   // we detect and recover from the race that the 1-0 exit admits.
 745   //
 746   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 747   // before it STs null into _owner, releasing the lock.  Updates
 748   // to data protected by the critical section must be visible before
 749   // we drop the lock (and thus before any other thread could acquire
 750   // the lock and observe the fields protected by the lock).
 751   // IA32's memory-model is SPO, so STs are ordered with respect to
 752   // each other and there's no need for an explicit barrier (fence).
 753   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 754 #ifndef _LP64
 755   get_thread (boxReg);
 756 
 757   // Note that we could employ various encoding schemes to reduce
 758   // the number of loads below (currently 4) to just 2 or 3.
 759   // Refer to the comments in synchronizer.cpp.
 760   // In practice the chain of fetches doesn't seem to impact performance, however.
 761   xorptr(boxReg, boxReg);
 762   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 763   jccb  (Assembler::notZero, DONE_LABEL);
 764   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 765   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 766   jccb  (Assembler::notZero, CheckSucc);
 767   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 768   jmpb  (DONE_LABEL);
 769 
 770   bind (Stacked);
 771   if (UseFastLocking) {
 772     mov(boxReg, tmpReg);
 773     fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
 774     xorl(tmpReg, tmpReg);
 775   } else {
 776     // It's not inflated and it's not recursively stack-locked and it's not biased.
 777     // It must be stack-locked.
 778     // Try to reset the header to displaced header.
 779     // The "box" value on the stack is stable, so we can reload
 780     // and be assured we observe the same value as above.
 781     movptr(tmpReg, Address(boxReg, 0));
 782     lock();
 783     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 784   }
 785   // Intentional fall-through into DONE_LABEL
 786 
 787   // DONE_LABEL is a hot target - we'd really like to place it at the
 788   // start of cache line by padding with NOPs.
 789   // See the AMD and Intel software optimization manuals for the
 790   // most efficient "long" NOP encodings.
 791   // Unfortunately none of our alignment mechanisms suffice.
 792   bind (CheckSucc);
 793 #else // _LP64
 794   // It's inflated
 795   Label LNotRecursive, LSuccess, LGoSlowPath;
 796 
 797   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 798   jccb(Assembler::equal, LNotRecursive);
 799 
 800   // Recursive inflated unlock
 801   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 802   jmpb(LSuccess);
 803 
 804   bind(LNotRecursive);
 805   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 806   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 807   jccb  (Assembler::notZero, CheckSucc);
 808   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 809   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 810   jmpb  (DONE_LABEL);
 811 
 812   // Try to avoid passing control into the slow_path ...
 813   bind  (CheckSucc);
 814 
 815   // The following optional optimization can be elided if necessary
 816   // Effectively: if (succ == null) goto slow path
 817   // The code reduces the window for a race, however,
 818   // and thus benefits performance.
 819   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 820   jccb  (Assembler::zero, LGoSlowPath);
 821 
 822   xorptr(boxReg, boxReg);
 823   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 824   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 825 
 826   // Memory barrier/fence
 827   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 828   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 829   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 830   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 831   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 832   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 833   lock(); addl(Address(rsp, 0), 0);
 834 
 835   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 836   jccb  (Assembler::notZero, LSuccess);
 837 
 838   // Rare inopportune interleaving - race.
 839   // The successor vanished in the small window above.
 840   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 841   // We need to ensure progress and succession.
 842   // Try to reacquire the lock.
 843   // If that fails then the new owner is responsible for succession and this
 844   // thread needs to take no further action and can exit via the fast path (success).
 845   // If the re-acquire succeeds then pass control into the slow path.
 846   // As implemented, this latter mode is horrible because we generate more
 847   // coherence traffic on the lock *and* artificially extend the critical section
 848   // length by passing control into the slow path.
 849 
 850   // box is really RAX -- the following CMPXCHG depends on that binding
 851   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 852   lock();
 853   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 854   // There's no successor so we tried to regrab the lock.
 855   // If that didn't work, then another thread grabbed the
 856   // lock so we're done (and exit was a success).
 857   jccb  (Assembler::notEqual, LSuccess);
 858   // Intentional fall-through into slow path
 859 
 860   bind  (LGoSlowPath);
 861   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 862   jmpb  (DONE_LABEL);
 863 
 864   bind  (LSuccess);
 865   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 866   jmpb  (DONE_LABEL);
 867 
 868   bind  (Stacked);
 869 
 870   if (UseFastLocking) {
 871     mov(boxReg, tmpReg);
 872     fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
 873     xorl(tmpReg, tmpReg);
 874   } else {
 875     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 876     lock();
 877     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 878   }
 879 
 880 #endif
 881   bind(DONE_LABEL);
 882 }
 883 
 884 //-------------------------------------------------------------------------------------------
 885 // Generic instructions support for use in .ad files C2 code generation
 886 
 887 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 888   if (dst != src) {
 889     movdqu(dst, src);
 890   }
 891   if (opcode == Op_AbsVD) {
 892     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 893   } else {
 894     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 895     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 896   }
 897 }
 898 
 899 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 900   if (opcode == Op_AbsVD) {
 901     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 902   } else {
 903     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 904     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 905   }
 906 }
 907 
 908 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 909   if (dst != src) {
 910     movdqu(dst, src);
 911   }
 912   if (opcode == Op_AbsVF) {
 913     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 914   } else {
 915     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 916     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 917   }
 918 }
 919 
 920 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 921   if (opcode == Op_AbsVF) {
 922     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 923   } else {
 924     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 925     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 926   }
 927 }
 928 
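     // SSE (non-AVX) vector min/max. There is no packed pminsq/pmaxsq instruction,
     // so the T_LONG case is composed from pcmpgtq (SSE4.2) plus blendvpd, whose
     // legacy encoding implicitly uses xmm0 as the blend mask -- hence the
     // requirement below that tmp == xmm0.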
 929 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 930   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 931   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 932 
 933   if (opcode == Op_MinV) {
 934     if (elem_bt == T_BYTE) {
 935       pminsb(dst, src);
 936     } else if (elem_bt == T_SHORT) {
 937       pminsw(dst, src);
 938     } else if (elem_bt == T_INT) {
 939       pminsd(dst, src);
 940     } else {
 941       assert(elem_bt == T_LONG, "required");
 942       assert(tmp == xmm0, "required");
 943       assert_different_registers(dst, src, tmp);
 944       movdqu(xmm0, dst);
 945       pcmpgtq(xmm0, src);
 946       blendvpd(dst, src);  // xmm0 as mask
 947     }
 948   } else { // opcode == Op_MaxV
 949     if (elem_bt == T_BYTE) {
 950       pmaxsb(dst, src);
 951     } else if (elem_bt == T_SHORT) {
 952       pmaxsw(dst, src);
 953     } else if (elem_bt == T_INT) {
 954       pmaxsd(dst, src);
 955     } else {
 956       assert(elem_bt == T_LONG, "required");
 957       assert(tmp == xmm0, "required");
 958       assert_different_registers(dst, src, tmp);
 959       movdqu(xmm0, src);
 960       pcmpgtq(xmm0, dst);
 961       blendvpd(dst, src);  // xmm0 as mask
 962     }
 963   }
 964 }
 965 
 966 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 967                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 968                                  int vlen_enc) {
 969   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 970 
 971   if (opcode == Op_MinV) {
 972     if (elem_bt == T_BYTE) {
 973       vpminsb(dst, src1, src2, vlen_enc);
 974     } else if (elem_bt == T_SHORT) {
 975       vpminsw(dst, src1, src2, vlen_enc);
 976     } else if (elem_bt == T_INT) {
 977       vpminsd(dst, src1, src2, vlen_enc);
 978     } else {
 979       assert(elem_bt == T_LONG, "required");
 980       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 981         vpminsq(dst, src1, src2, vlen_enc);
 982       } else {
 983         assert_different_registers(dst, src1, src2);
 984         vpcmpgtq(dst, src1, src2, vlen_enc);
 985         vblendvpd(dst, src1, src2, dst, vlen_enc);
 986       }
 987     }
 988   } else { // opcode == Op_MaxV
 989     if (elem_bt == T_BYTE) {
 990       vpmaxsb(dst, src1, src2, vlen_enc);
 991     } else if (elem_bt == T_SHORT) {
 992       vpmaxsw(dst, src1, src2, vlen_enc);
 993     } else if (elem_bt == T_INT) {
 994       vpmaxsd(dst, src1, src2, vlen_enc);
 995     } else {
 996       assert(elem_bt == T_LONG, "required");
 997       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 998         vpmaxsq(dst, src1, src2, vlen_enc);
 999       } else {
1000         assert_different_registers(dst, src1, src2);
1001         vpcmpgtq(dst, src1, src2, vlen_enc);
1002         vblendvpd(dst, src2, src1, dst, vlen_enc);
1003       }
1004     }
1005   }
1006 }
1007 
1008 // Float/Double min max
1009 
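     // The blend/min/blend sequences below follow Java floating-point min/max
     // semantics: a NaN in either input produces NaN, and -0.0 compares smaller than
     // +0.0, which raw vminps/vmaxps do not guarantee. The first two blends order the
     // operands by the sign of one input, the min/max does the arithmetic (returning
     // its second operand when unordered), and the final unordered compare + blend
     // propagates a NaN sitting in the first operand.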
1010 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1011                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1012                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1013                                    int vlen_enc) {
1014   assert(UseAVX > 0, "required");
1015   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1016          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1017   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1018   assert_different_registers(a, b, tmp, atmp, btmp);
1019 
1020   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1021   bool is_double_word = is_double_word_type(elem_bt);
1022 
1023   if (!is_double_word && is_min) {
1024     vblendvps(atmp, a, b, a, vlen_enc);
1025     vblendvps(btmp, b, a, a, vlen_enc);
1026     vminps(tmp, atmp, btmp, vlen_enc);
1027     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1028     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1029   } else if (!is_double_word && !is_min) {
1030     vblendvps(btmp, b, a, b, vlen_enc);
1031     vblendvps(atmp, a, b, b, vlen_enc);
1032     vmaxps(tmp, atmp, btmp, vlen_enc);
1033     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1034     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1035   } else if (is_double_word && is_min) {
1036     vblendvpd(atmp, a, b, a, vlen_enc);
1037     vblendvpd(btmp, b, a, a, vlen_enc);
1038     vminpd(tmp, atmp, btmp, vlen_enc);
1039     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1040     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1041   } else {
1042     assert(is_double_word && !is_min, "sanity");
1043     vblendvpd(btmp, b, a, b, vlen_enc);
1044     vblendvpd(atmp, a, b, b, vlen_enc);
1045     vmaxpd(tmp, atmp, btmp, vlen_enc);
1046     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1047     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1048   }
1049 }
1050 
1051 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1052                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1053                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1054                                     int vlen_enc) {
1055   assert(UseAVX > 2, "required");
1056   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1057          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1058   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1059   assert_different_registers(dst, a, b, atmp, btmp);
1060 
1061   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1062   bool is_double_word = is_double_word_type(elem_bt);
1063   bool merge = true;
1064 
1065   if (!is_double_word && is_min) {
1066     evpmovd2m(ktmp, a, vlen_enc);
1067     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1068     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1069     vminps(dst, atmp, btmp, vlen_enc);
1070     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1071     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1072   } else if (!is_double_word && !is_min) {
1073     evpmovd2m(ktmp, b, vlen_enc);
1074     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1075     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1076     vmaxps(dst, atmp, btmp, vlen_enc);
1077     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1078     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1079   } else if (is_double_word && is_min) {
1080     evpmovq2m(ktmp, a, vlen_enc);
1081     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1082     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1083     vminpd(dst, atmp, btmp, vlen_enc);
1084     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1085     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1086   } else {
1087     assert(is_double_word && !is_min, "sanity");
1088     evpmovq2m(ktmp, b, vlen_enc);
1089     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1090     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1091     vmaxpd(dst, atmp, btmp, vlen_enc);
1092     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1093     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1094   }
1095 }
1096 
1097 // Float/Double signum
1098 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1099                                   XMMRegister zero, XMMRegister one,
1100                                   Register scratch) {
1101   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1102 
1103   Label DONE_LABEL;
1104 
1105   if (opcode == Op_SignumF) {
1106     assert(UseSSE > 0, "required");
1107     ucomiss(dst, zero);
1108     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1109     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1110     movflt(dst, one);
1111     jcc(Assembler::above, DONE_LABEL);
1112     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1113   } else if (opcode == Op_SignumD) {
1114     assert(UseSSE > 1, "required");
1115     ucomisd(dst, zero);
1116     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1117     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1118     movdbl(dst, one);
1119     jcc(Assembler::above, DONE_LABEL);
1120     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1121   }
1122 
1123   bind(DONE_LABEL);
1124 }
1125 
1126 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1127   if (sign) {
1128     pmovsxbw(dst, src);
1129   } else {
1130     pmovzxbw(dst, src);
1131   }
1132 }
1133 
1134 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1135   if (sign) {
1136     vpmovsxbw(dst, src, vector_len);
1137   } else {
1138     vpmovzxbw(dst, src, vector_len);
1139   }
1140 }
1141 
1142 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1143   if (sign) {
1144     vpmovsxbd(dst, src, vector_len);
1145   } else {
1146     vpmovzxbd(dst, src, vector_len);
1147   }
1148 }
1149 
1150 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1151   if (sign) {
1152     vpmovsxwd(dst, src, vector_len);
1153   } else {
1154     vpmovzxwd(dst, src, vector_len);
1155   }
1156 }
1157 
1158 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1159                                      int shift, int vector_len) {
1160   if (opcode == Op_RotateLeftV) {
1161     if (etype == T_INT) {
1162       evprold(dst, src, shift, vector_len);
1163     } else {
1164       assert(etype == T_LONG, "expected type T_LONG");
1165       evprolq(dst, src, shift, vector_len);
1166     }
1167   } else {
1168     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1169     if (etype == T_INT) {
1170       evprord(dst, src, shift, vector_len);
1171     } else {
1172       assert(etype == T_LONG, "expected type T_LONG");
1173       evprorq(dst, src, shift, vector_len);
1174     }
1175   }
1176 }
1177 
1178 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1179                                      XMMRegister shift, int vector_len) {
1180   if (opcode == Op_RotateLeftV) {
1181     if (etype == T_INT) {
1182       evprolvd(dst, src, shift, vector_len);
1183     } else {
1184       assert(etype == T_LONG, "expected type T_LONG");
1185       evprolvq(dst, src, shift, vector_len);
1186     }
1187   } else {
1188     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1189     if (etype == T_INT) {
1190       evprorvd(dst, src, shift, vector_len);
1191     } else {
1192       assert(etype == T_LONG, "expected type T_LONG");
1193       evprorvq(dst, src, shift, vector_len);
1194     }
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1199   if (opcode == Op_RShiftVI) {
1200     psrad(dst, shift);
1201   } else if (opcode == Op_LShiftVI) {
1202     pslld(dst, shift);
1203   } else {
1204     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1205     psrld(dst, shift);
1206   }
1207 }
1208 
1209 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1210   switch (opcode) {
1211     case Op_RShiftVI:  psrad(dst, shift); break;
1212     case Op_LShiftVI:  pslld(dst, shift); break;
1213     case Op_URShiftVI: psrld(dst, shift); break;
1214 
1215     default: assert(false, "%s", NodeClassNames[opcode]);
1216   }
1217 }
1218 
1219 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1220   if (opcode == Op_RShiftVI) {
1221     vpsrad(dst, nds, shift, vector_len);
1222   } else if (opcode == Op_LShiftVI) {
1223     vpslld(dst, nds, shift, vector_len);
1224   } else {
1225     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1226     vpsrld(dst, nds, shift, vector_len);
1227   }
1228 }
1229 
1230 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1231   switch (opcode) {
1232     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1233     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1234     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1235 
1236     default: assert(false, "%s", NodeClassNames[opcode]);
1237   }
1238 }
1239 
1240 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1241   switch (opcode) {
1242     case Op_RShiftVB:  // fall-through
1243     case Op_RShiftVS:  psraw(dst, shift); break;
1244 
1245     case Op_LShiftVB:  // fall-through
1246     case Op_LShiftVS:  psllw(dst, shift);   break;
1247 
1248     case Op_URShiftVS: // fall-through
1249     case Op_URShiftVB: psrlw(dst, shift);  break;
1250 
1251     default: assert(false, "%s", NodeClassNames[opcode]);
1252   }
1253 }
1254 
1255 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1256   switch (opcode) {
1257     case Op_RShiftVB:  // fall-through
1258     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1259 
1260     case Op_LShiftVB:  // fall-through
1261     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1262 
1263     case Op_URShiftVS: // fall-through
1264     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1265 
1266     default: assert(false, "%s", NodeClassNames[opcode]);
1267   }
1268 }
1269 
1270 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1271   switch (opcode) {
1272     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1273     case Op_LShiftVL:  psllq(dst, shift); break;
1274     case Op_URShiftVL: psrlq(dst, shift); break;
1275 
1276     default: assert(false, "%s", NodeClassNames[opcode]);
1277   }
1278 }
1279 
1280 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1281   if (opcode == Op_RShiftVL) {
1282     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1283   } else if (opcode == Op_LShiftVL) {
1284     psllq(dst, shift);
1285   } else {
1286     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1287     psrlq(dst, shift);
1288   }
1289 }
1290 
1291 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1292   switch (opcode) {
1293     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1294     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1295     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1296 
1297     default: assert(false, "%s", NodeClassNames[opcode]);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1302   if (opcode == Op_RShiftVL) {
1303     evpsraq(dst, nds, shift, vector_len);
1304   } else if (opcode == Op_LShiftVL) {
1305     vpsllq(dst, nds, shift, vector_len);
1306   } else {
1307     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1308     vpsrlq(dst, nds, shift, vector_len);
1309   }
1310 }
1311 
1312 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1313   switch (opcode) {
1314     case Op_RShiftVB:  // fall-through
1315     case Op_RShiftVS:  // fall-through
1316     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1317 
1318     case Op_LShiftVB:  // fall-through
1319     case Op_LShiftVS:  // fall-through
1320     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1321 
1322     case Op_URShiftVB: // fall-through
1323     case Op_URShiftVS: // fall-through
1324     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1325 
1326     default: assert(false, "%s", NodeClassNames[opcode]);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1331   switch (opcode) {
1332     case Op_RShiftVB:  // fall-through
1333     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1334 
1335     case Op_LShiftVB:  // fall-through
1336     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1337 
1338     case Op_URShiftVB: // fall-through
1339     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1340 
1341     default: assert(false, "%s", NodeClassNames[opcode]);
1342   }
1343 }
1344 
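     // Variable shift of long elements. AVX-512 (evpsravq) handles the arithmetic
     // right shift directly; on AVX2 it is emulated with logical shifts: shift the
     // value and a sign-bit mask logically, then (x ^ m) - m re-extends the sign,
     // where m holds the sign bit shifted by the same per-lane amount.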
1345 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1346   assert(UseAVX >= 2, "required");
1347   switch (opcode) {
1348     case Op_RShiftVL: {
1349       if (UseAVX > 2) {
1350         assert(tmp == xnoreg, "not used");
1351         if (!VM_Version::supports_avx512vl()) {
1352           vlen_enc = Assembler::AVX_512bit;
1353         }
1354         evpsravq(dst, src, shift, vlen_enc);
1355       } else {
1356         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1357         vpsrlvq(dst, src, shift, vlen_enc);
1358         vpsrlvq(tmp, tmp, shift, vlen_enc);
1359         vpxor(dst, dst, tmp, vlen_enc);
1360         vpsubq(dst, dst, tmp, vlen_enc);
1361       }
1362       break;
1363     }
1364     case Op_LShiftVL: {
1365       assert(tmp == xnoreg, "not used");
1366       vpsllvq(dst, src, shift, vlen_enc);
1367       break;
1368     }
1369     case Op_URShiftVL: {
1370       assert(tmp == xnoreg, "not used");
1371       vpsrlvq(dst, src, shift, vlen_enc);
1372       break;
1373     }
1374     default: assert(false, "%s", NodeClassNames[opcode]);
1375   }
1376 }
1377 
1378 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1379 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1380   assert(opcode == Op_LShiftVB ||
1381          opcode == Op_RShiftVB ||
1382          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1383   bool sign = (opcode != Op_URShiftVB);
1384   assert(vector_len == 0, "required");
1385   vextendbd(sign, dst, src, 1);
1386   vpmovzxbd(vtmp, shift, 1);
1387   varshiftd(opcode, dst, dst, vtmp, 1);
1388   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1389   vextracti128_high(vtmp, dst);
1390   vpackusdw(dst, dst, vtmp, 0);
1391 }
1392 
1393 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1394 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1395   assert(opcode == Op_LShiftVB ||
1396          opcode == Op_RShiftVB ||
1397          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1398   bool sign = (opcode != Op_URShiftVB);
1399   int ext_vector_len = vector_len + 1;
1400   vextendbw(sign, dst, src, ext_vector_len);
1401   vpmovzxbw(vtmp, shift, ext_vector_len);
1402   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1403   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1404   if (vector_len == 0) {
1405     vextracti128_high(vtmp, dst);
1406     vpackuswb(dst, dst, vtmp, vector_len);
1407   } else {
1408     vextracti64x4_high(vtmp, dst);
1409     vpackuswb(dst, dst, vtmp, vector_len);
1410     vpermq(dst, dst, 0xD8, vector_len);
1411   }
1412 }
1413 
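     // Insert a general purpose register value into element 'idx' of a 128-bit vector.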
1414 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1415   switch(typ) {
1416     case T_BYTE:
1417       pinsrb(dst, val, idx);
1418       break;
1419     case T_SHORT:
1420       pinsrw(dst, val, idx);
1421       break;
1422     case T_INT:
1423       pinsrd(dst, val, idx);
1424       break;
1425     case T_LONG:
1426       pinsrq(dst, val, idx);
1427       break;
1428     default:
1429       assert(false,"Should not reach here.");
1430       break;
1431   }
1432 }
1433 
1434 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1435   switch(typ) {
1436     case T_BYTE:
1437       vpinsrb(dst, src, val, idx);
1438       break;
1439     case T_SHORT:
1440       vpinsrw(dst, src, val, idx);
1441       break;
1442     case T_INT:
1443       vpinsrd(dst, src, val, idx);
1444       break;
1445     case T_LONG:
1446       vpinsrq(dst, src, val, idx);
1447       break;
1448     default:
1449       assert(false,"Should not reach here.");
1450       break;
1451   }
1452 }
1453 
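     // AVX2 gather: load elements from memory at base + 32-bit indices taken from
     // 'idx'; the sign bit of each lane in 'mask' selects the active elements.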
1454 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1455   switch(typ) {
1456     case T_INT:
1457       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1458       break;
1459     case T_FLOAT:
1460       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1461       break;
1462     case T_LONG:
1463       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1464       break;
1465     case T_DOUBLE:
1466       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1467       break;
1468     default:
1469       assert(false,"Should not reach here.");
1470       break;
1471   }
1472 }
1473 
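     // AVX-512 gather: same addressing as above, but the active elements are
     // selected by an opmask (k) register.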
1474 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1475   switch(typ) {
1476     case T_INT:
1477       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1478       break;
1479     case T_FLOAT:
1480       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1481       break;
1482     case T_LONG:
1483       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1484       break;
1485     case T_DOUBLE:
1486       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1487       break;
1488     default:
1489       assert(false,"Should not reach here.");
1490       break;
1491   }
1492 }
1493 
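     // AVX-512 scatter: store the lanes of 'src' to base + 32-bit indices under
     // the opmask.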
1494 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1495   switch(typ) {
1496     case T_INT:
1497       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1498       break;
1499     case T_FLOAT:
1500       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1501       break;
1502     case T_LONG:
1503       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1504       break;
1505     case T_DOUBLE:
1506       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1507       break;
1508     default:
1509       assert(false,"Should not reach here.");
1510       break;
1511   }
1512 }
1513 
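     // Expand a boolean vector (one 0/1 byte per element) into a full-width lane
     // mask: negate the bytes so 1 becomes 0xFF, then sign-extend to the element
     // size of 'elem_bt'.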
1514 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1515   if (vlen_in_bytes <= 16) {
1516     pxor (dst, dst);
1517     psubb(dst, src);
1518     switch (elem_bt) {
1519       case T_BYTE:   /* nothing to do */ break;
1520       case T_SHORT:  pmovsxbw(dst, dst); break;
1521       case T_INT:    pmovsxbd(dst, dst); break;
1522       case T_FLOAT:  pmovsxbd(dst, dst); break;
1523       case T_LONG:   pmovsxbq(dst, dst); break;
1524       case T_DOUBLE: pmovsxbq(dst, dst); break;
1525 
1526       default: assert(false, "%s", type2name(elem_bt));
1527     }
1528   } else {
1529     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1530     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1531 
1532     vpxor (dst, dst, dst, vlen_enc);
1533     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1534 
1535     switch (elem_bt) {
1536       case T_BYTE:   /* nothing to do */            break;
1537       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1538       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1539       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1540       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1541       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1542 
1543       default: assert(false, "%s", type2name(elem_bt));
1544     }
1545   }
1546 }
1547 
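     // Load the first vlen_in_bytes bytes of the iota (0, 1, 2, ...) index table.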
1548 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1549   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1550   if (vlen_in_bytes == 4) {
1551     movdl(dst, addr);
1552   } else if (vlen_in_bytes == 8) {
1553     movq(dst, addr);
1554   } else if (vlen_in_bytes == 16) {
1555     movdqu(dst, addr, scratch);
1556   } else if (vlen_in_bytes == 32) {
1557     vmovdqu(dst, addr, scratch);
1558   } else {
1559     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1560     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1561   }
1562 }
1563 
1564 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
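     //
     // The integral helpers below repeatedly fold the upper half of the vector into
     // the lower half (reduce_operation_128/256) until one lane remains, combine it
     // with the scalar input src1, and leave the result in a general purpose register.
     // The FP variants keep the accumulator in an XMM register and combine the lanes
     // strictly in order.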
1565 
1566 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1567   int vector_len = Assembler::AVX_128bit;
1568 
1569   switch (opcode) {
1570     case Op_AndReductionV:  pand(dst, src); break;
1571     case Op_OrReductionV:   por (dst, src); break;
1572     case Op_XorReductionV:  pxor(dst, src); break;
1573     case Op_MinReductionV:
1574       switch (typ) {
1575         case T_BYTE:        pminsb(dst, src); break;
1576         case T_SHORT:       pminsw(dst, src); break;
1577         case T_INT:         pminsd(dst, src); break;
1578         case T_LONG:        assert(UseAVX > 2, "required");
1579                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1580         default:            assert(false, "wrong type");
1581       }
1582       break;
1583     case Op_MaxReductionV:
1584       switch (typ) {
1585         case T_BYTE:        pmaxsb(dst, src); break;
1586         case T_SHORT:       pmaxsw(dst, src); break;
1587         case T_INT:         pmaxsd(dst, src); break;
1588         case T_LONG:        assert(UseAVX > 2, "required");
1589                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1590         default:            assert(false, "wrong type");
1591       }
1592       break;
1593     case Op_AddReductionVF: addss(dst, src); break;
1594     case Op_AddReductionVD: addsd(dst, src); break;
1595     case Op_AddReductionVI:
1596       switch (typ) {
1597         case T_BYTE:        paddb(dst, src); break;
1598         case T_SHORT:       paddw(dst, src); break;
1599         case T_INT:         paddd(dst, src); break;
1600         default:            assert(false, "wrong type");
1601       }
1602       break;
1603     case Op_AddReductionVL: paddq(dst, src); break;
1604     case Op_MulReductionVF: mulss(dst, src); break;
1605     case Op_MulReductionVD: mulsd(dst, src); break;
1606     case Op_MulReductionVI:
1607       switch (typ) {
1608         case T_SHORT:       pmullw(dst, src); break;
1609         case T_INT:         pmulld(dst, src); break;
1610         default:            assert(false, "wrong type");
1611       }
1612       break;
1613     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1614                             vpmullq(dst, dst, src, vector_len); break;
1615     default:                assert(false, "wrong opcode");
1616   }
1617 }
1618 
1619 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1620   int vector_len = Assembler::AVX_256bit;
1621 
1622   switch (opcode) {
1623     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1624     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1625     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1626     case Op_MinReductionV:
1627       switch (typ) {
1628         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1629         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1630         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1631         case T_LONG:        assert(UseAVX > 2, "required");
1632                             vpminsq(dst, src1, src2, vector_len); break;
1633         default:            assert(false, "wrong type");
1634       }
1635       break;
1636     case Op_MaxReductionV:
1637       switch (typ) {
1638         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1639         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1640         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1641         case T_LONG:        assert(UseAVX > 2, "required");
1642                             vpmaxsq(dst, src1, src2, vector_len); break;
1643         default:            assert(false, "wrong type");
1644       }
1645       break;
1646     case Op_AddReductionVI:
1647       switch (typ) {
1648         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1649         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1650         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1651         default:            assert(false, "wrong type");
1652       }
1653       break;
1654     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1655     case Op_MulReductionVI:
1656       switch (typ) {
1657         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1658         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1659         default:            assert(false, "wrong type");
1660       }
1661       break;
1662     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1663     default:                assert(false, "wrong opcode");
1664   }
1665 }
1666 
1667 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1668                                   XMMRegister dst, XMMRegister src,
1669                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1670   switch (opcode) {
1671     case Op_AddReductionVF:
1672     case Op_MulReductionVF:
1673       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1674       break;
1675 
1676     case Op_AddReductionVD:
1677     case Op_MulReductionVD:
1678       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1679       break;
1680 
1681     default: assert(false, "wrong opcode");
1682   }
1683 }
1684 
1685 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1686                              Register dst, Register src1, XMMRegister src2,
1687                              XMMRegister vtmp1, XMMRegister vtmp2) {
1688   switch (vlen) {
1689     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1690     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1691     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1692     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1693 
1694     default: assert(false, "wrong vector length");
1695   }
1696 }
1697 
1698 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1699                              Register dst, Register src1, XMMRegister src2,
1700                              XMMRegister vtmp1, XMMRegister vtmp2) {
1701   switch (vlen) {
1702     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1703     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1704     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1705     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1706 
1707     default: assert(false, "wrong vector length");
1708   }
1709 }
1710 
1711 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1712                              Register dst, Register src1, XMMRegister src2,
1713                              XMMRegister vtmp1, XMMRegister vtmp2) {
1714   switch (vlen) {
1715     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1716     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1717     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1718     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1719 
1720     default: assert(false, "wrong vector length");
1721   }
1722 }
1723 
1724 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1725                              Register dst, Register src1, XMMRegister src2,
1726                              XMMRegister vtmp1, XMMRegister vtmp2) {
1727   switch (vlen) {
1728     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1729     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1730     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1731     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1732 
1733     default: assert(false, "wrong vector length");
1734   }
1735 }
1736 
1737 #ifdef _LP64
1738 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1739                              Register dst, Register src1, XMMRegister src2,
1740                              XMMRegister vtmp1, XMMRegister vtmp2) {
1741   switch (vlen) {
1742     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1743     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1744     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1745 
1746     default: assert(false, "wrong vector length");
1747   }
1748 }
1749 #endif // _LP64
1750 
1751 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1752   switch (vlen) {
1753     case 2:
1754       assert(vtmp2 == xnoreg, "");
1755       reduce2F(opcode, dst, src, vtmp1);
1756       break;
1757     case 4:
1758       assert(vtmp2 == xnoreg, "");
1759       reduce4F(opcode, dst, src, vtmp1);
1760       break;
1761     case 8:
1762       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1763       break;
1764     case 16:
1765       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1766       break;
1767     default: assert(false, "wrong vector length");
1768   }
1769 }
1770 
1771 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1772   switch (vlen) {
1773     case 2:
1774       assert(vtmp2 == xnoreg, "");
1775       reduce2D(opcode, dst, src, vtmp1);
1776       break;
1777     case 4:
1778       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1779       break;
1780     case 8:
1781       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1782       break;
1783     default: assert(false, "wrong vector length");
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1788   if (opcode == Op_AddReductionVI) {
1789     if (vtmp1 != src2) {
1790       movdqu(vtmp1, src2);
1791     }
1792     phaddd(vtmp1, vtmp1);
1793   } else {
1794     pshufd(vtmp1, src2, 0x1);
1795     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1796   }
1797   movdl(vtmp2, src1);
1798   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1799   movdl(dst, vtmp1);
1800 }
1801 
1802 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1803   if (opcode == Op_AddReductionVI) {
1804     if (vtmp1 != src2) {
1805       movdqu(vtmp1, src2);
1806     }
1807     phaddd(vtmp1, src2);
1808     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1809   } else {
1810     pshufd(vtmp2, src2, 0xE);
1811     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1812     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1813   }
1814 }
1815 
1816 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1817   if (opcode == Op_AddReductionVI) {
1818     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1819     vextracti128_high(vtmp2, vtmp1);
1820     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1821     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1822   } else {
1823     vextracti128_high(vtmp1, src2);
1824     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1825     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1826   }
1827 }
1828 
1829 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1830   vextracti64x4_high(vtmp2, src2);
1831   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1832   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1833 }
1834 
1835 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1836   pshufd(vtmp2, src2, 0x1);
1837   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1838   movdqu(vtmp1, vtmp2);
1839   psrldq(vtmp1, 2);
1840   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1841   movdqu(vtmp2, vtmp1);
1842   psrldq(vtmp2, 1);
1843   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1844   movdl(vtmp2, src1);
1845   pmovsxbd(vtmp1, vtmp1);
1846   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1847   pextrb(dst, vtmp1, 0x0);
1848   movsbl(dst, dst);
1849 }
1850 
1851 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1852   pshufd(vtmp1, src2, 0xE);
1853   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1854   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1855 }
1856 
1857 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1858   vextracti128_high(vtmp2, src2);
1859   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1860   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1861 }
1862 
1863 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1864   vextracti64x4_high(vtmp1, src2);
1865   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1866   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1867 }
1868 
1869 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1870   pmovsxbw(vtmp2, src2);
1871   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1872 }
1873 
1874 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1875   if (UseAVX > 1) {
1876     int vector_len = Assembler::AVX_256bit;
1877     vpmovsxbw(vtmp1, src2, vector_len);
1878     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1879   } else {
1880     pmovsxbw(vtmp2, src2);
1881     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1882     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes into the low half
1883     pmovsxbw(vtmp2, vtmp2);
1884     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1885   }
1886 }
1887 
1888 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1889   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1890     int vector_len = Assembler::AVX_512bit;
1891     vpmovsxbw(vtmp1, src2, vector_len);
1892     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1893   } else {
1894     assert(UseAVX >= 2,"Should not reach here.");
1895     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1896     vextracti128_high(vtmp2, src2);
1897     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1898   }
1899 }
1900 
1901 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1902   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1903   vextracti64x4_high(vtmp2, src2);
1904   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1905 }
1906 
1907 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1908   if (opcode == Op_AddReductionVI) {
1909     if (vtmp1 != src2) {
1910       movdqu(vtmp1, src2);
1911     }
1912     phaddw(vtmp1, vtmp1);
1913     phaddw(vtmp1, vtmp1);
1914   } else {
1915     pshufd(vtmp2, src2, 0x1);
1916     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1917     movdqu(vtmp1, vtmp2);
1918     psrldq(vtmp1, 2);
1919     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1920   }
1921   movdl(vtmp2, src1);
1922   pmovsxwd(vtmp1, vtmp1);
1923   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1924   pextrw(dst, vtmp1, 0x0);
1925   movswl(dst, dst);
1926 }
1927 
1928 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1929   if (opcode == Op_AddReductionVI) {
1930     if (vtmp1 != src2) {
1931       movdqu(vtmp1, src2);
1932     }
1933     phaddw(vtmp1, src2);
1934   } else {
1935     pshufd(vtmp1, src2, 0xE);
1936     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1937   }
1938   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1939 }
1940 
1941 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1942   if (opcode == Op_AddReductionVI) {
1943     int vector_len = Assembler::AVX_256bit;
1944     vphaddw(vtmp2, src2, src2, vector_len);
1945     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1946   } else {
1947     vextracti128_high(vtmp2, src2);
1948     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1949   }
1950   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1951 }
1952 
1953 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1954   int vector_len = Assembler::AVX_256bit;
1955   vextracti64x4_high(vtmp1, src2);
1956   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1957   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1958 }
1959 
1960 #ifdef _LP64
1961 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1962   pshufd(vtmp2, src2, 0xE);
1963   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1964   movdq(vtmp1, src1);
1965   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1966   movdq(dst, vtmp1);
1967 }
1968 
1969 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1970   vextracti128_high(vtmp1, src2);
1971   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1972   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1973 }
1974 
1975 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1976   vextracti64x4_high(vtmp2, src2);
1977   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1978   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1979 }
1980 
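     // Build an opmask with the low 'len' bits set: BZHI zeroes the bits of an
     // all-ones value from position 'len' upwards, and the result is moved into
     // the mask register.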
1981 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1982   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1983   mov64(temp, -1L);
1984   bzhiq(temp, temp, len);
1985   kmovql(dst, temp);
1986 }
1987 #endif // _LP64
1988 
1989 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1990   reduce_operation_128(T_FLOAT, opcode, dst, src);
1991   pshufd(vtmp, src, 0x1);
1992   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1993 }
1994 
1995 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1996   reduce2F(opcode, dst, src, vtmp);
1997   pshufd(vtmp, src, 0x2);
1998   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1999   pshufd(vtmp, src, 0x3);
2000   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2001 }
2002 
2003 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2004   reduce4F(opcode, dst, src, vtmp2);
2005   vextractf128_high(vtmp2, src);
2006   reduce4F(opcode, dst, vtmp2, vtmp1);
2007 }
2008 
2009 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2011   vextracti64x4_high(vtmp1, src);
2012   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2013 }
2014 
2015 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2016   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2017   pshufd(vtmp, src, 0xE);
2018   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2019 }
2020 
2021 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2022   reduce2D(opcode, dst, src, vtmp2);
2023   vextractf128_high(vtmp2, src);
2024   reduce2D(opcode, dst, vtmp2, vtmp1);
2025 }
2026 
2027 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2028   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2029   vextracti64x4_high(vtmp1, src);
2030   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2031 }
2032 
2033 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
2034   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2035 }
2036 
2037 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
2038   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2039 }
2040 
2041 
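     // Min/max reduction for float vectors: repeatedly halve the live width by
     // extracting the upper half (or permuting within a 128-bit lane) and folding
     // it into the lower half with vminmax_fp. If dst already holds a partial
     // result (is_dst_valid), it is folded in with a final vminmax_fp.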
2042 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2043                                           XMMRegister dst, XMMRegister src,
2044                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2045                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2046   int permconst[] = {1, 14};
2047   XMMRegister wsrc = src;
2048   XMMRegister wdst = xmm_0;
2049   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2050 
2051   int vlen_enc = Assembler::AVX_128bit;
2052   if (vlen == 16) {
2053     vlen_enc = Assembler::AVX_256bit;
2054   }
2055 
2056   for (int i = log2(vlen) - 1; i >=0; i--) {
2057     if (i == 0 && !is_dst_valid) {
2058       wdst = dst;
2059     }
2060     if (i == 3) {
2061       vextracti64x4_high(wtmp, wsrc);
2062     } else if (i == 2) {
2063       vextracti128_high(wtmp, wsrc);
2064     } else { // i = [0,1]
2065       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2066     }
2067     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2068     wsrc = wdst;
2069     vlen_enc = Assembler::AVX_128bit;
2070   }
2071   if (is_dst_valid) {
2072     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2073   }
2074 }
2075 
2076 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2077                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2078                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2079   XMMRegister wsrc = src;
2080   XMMRegister wdst = xmm_0;
2081   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2082   int vlen_enc = Assembler::AVX_128bit;
2083   if (vlen == 8) {
2084     vlen_enc = Assembler::AVX_256bit;
2085   }
2086   for (int i = log2(vlen) - 1; i >=0; i--) {
2087     if (i == 0 && !is_dst_valid) {
2088       wdst = dst;
2089     }
2090     if (i == 1) {
2091       vextracti128_high(wtmp, wsrc);
2092     } else if (i == 2) {
2093       vextracti64x4_high(wtmp, wsrc);
2094     } else {
2095       assert(i == 0, "%d", i);
2096       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2097     }
2098     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2099     wsrc = wdst;
2100     vlen_enc = Assembler::AVX_128bit;
2101   }
2102   if (is_dst_valid) {
2103     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2104   }
2105 }
2106 
2107 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2108   switch (bt) {
2109     case T_BYTE:  pextrb(dst, src, idx); break;
2110     case T_SHORT: pextrw(dst, src, idx); break;
2111     case T_INT:   pextrd(dst, src, idx); break;
2112     case T_LONG:  pextrq(dst, src, idx); break;
2113 
2114     default:
2115       assert(false,"Should not reach here.");
2116       break;
2117   }
2118 }
2119 
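     // Return the 128-bit lane holding element 'elemindex', extracting it into
     // 'dst' when it is not the lowest lane.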
2120 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2121   int esize =  type2aelembytes(typ);
2122   int elem_per_lane = 16/esize;
2123   int lane = elemindex / elem_per_lane;
2124   int eindex = elemindex % elem_per_lane;
2125 
2126   if (lane >= 2) {
2127     assert(UseAVX > 2, "required");
2128     vextractf32x4(dst, src, lane & 3);
2129     return dst;
2130   } else if (lane > 0) {
2131     assert(UseAVX > 0, "required");
2132     vextractf128(dst, src, lane);
2133     return dst;
2134   } else {
2135     return src;
2136   }
2137 }
2138 
2139 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2140   int esize =  type2aelembytes(typ);
2141   int elem_per_lane = 16/esize;
2142   int eindex = elemindex % elem_per_lane;
2143   assert(is_integral_type(typ),"required");
2144 
2145   if (eindex == 0) {
2146     if (typ == T_LONG) {
2147       movq(dst, src);
2148     } else {
2149       movdl(dst, src);
2150       if (typ == T_BYTE)
2151         movsbl(dst, dst);
2152       else if (typ == T_SHORT)
2153         movswl(dst, dst);
2154     }
2155   } else {
2156     extract(typ, dst, src, eindex);
2157   }
2158 }
2159 
2160 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2161   int esize =  type2aelembytes(typ);
2162   int elem_per_lane = 16/esize;
2163   int eindex = elemindex % elem_per_lane;
2164   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2165 
2166   if (eindex == 0) {
2167     movq(dst, src);
2168   } else {
2169     if (typ == T_FLOAT) {
2170       if (UseAVX == 0) {
2171         movdqu(dst, src);
2172         pshufps(dst, dst, eindex);
2173       } else {
2174         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2175       }
2176     } else {
2177       if (UseAVX == 0) {
2178         movdqu(dst, src);
2179         psrldq(dst, eindex*esize);
2180       } else {
2181         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2182       }
2183       movq(dst, dst);
2184     }
2185   }
2186   // Zero upper bits
2187   if (typ == T_FLOAT) {
2188     if (UseAVX == 0) {
2189       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2190       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2191       pand(dst, vtmp);
2192     } else {
2193       assert((tmp != noreg), "required.");
2194       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2195     }
2196   }
2197 }
2198 
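     // AVX-512 masked compare: select the evpcmpb/w/d/q form matching the element
     // width; float and double reuse the integer compare of the same width.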
2199 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2200   switch(typ) {
2201     case T_BYTE:
2202     case T_BOOLEAN:
2203       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2204       break;
2205     case T_SHORT:
2206     case T_CHAR:
2207       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2208       break;
2209     case T_INT:
2210     case T_FLOAT:
2211       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2212       break;
2213     case T_LONG:
2214     case T_DOUBLE:
2215       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2216       break;
2217     default:
2218       assert(false,"Should not reach here.");
2219       break;
2220   }
2221 }
2222 
2223 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2224   switch(typ) {
2225     case T_BOOLEAN:
2226     case T_BYTE:
2227       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2228       break;
2229     case T_CHAR:
2230     case T_SHORT:
2231       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2232       break;
2233     case T_INT:
2234     case T_FLOAT:
2235       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2236       break;
2237     case T_LONG:
2238     case T_DOUBLE:
2239       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2240       break;
2241     default:
2242       assert(false,"Should not reach here.");
2243       break;
2244   }
2245 }
2246 
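     // Unsigned compare without AVX-512 compare-into-mask: zero-extend both operands
     // to the next wider element size, compare there (the values are non-negative,
     // so a signed compare is safe), and pack the result back to the original width.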
2247 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2248                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2249   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2250   switch (typ) {
2251   case T_BYTE:
2252     vpmovzxbw(vtmp1, src1, vlen_enc);
2253     vpmovzxbw(vtmp2, src2, vlen_enc);
2254     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2255     vpacksswb(dst, dst, dst, vlen_enc);
2256     break;
2257   case T_SHORT:
2258     vpmovzxwd(vtmp1, src1, vlen_enc);
2259     vpmovzxwd(vtmp2, src2, vlen_enc);
2260     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2261     vpackssdw(dst, dst, dst, vlen_enc);
2262     break;
2263   case T_INT:
2264     vpmovzxdq(vtmp1, src1, vlen_enc);
2265     vpmovzxdq(vtmp2, src2, vlen_enc);
2266     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2267     vpermilps(dst, dst, 8, vlen_enc);
2268     break;
2269   default:
2270     assert(false, "Should not reach here");
2271   }
2272   if (vlen_in_bytes == 16) {
2273     vpermpd(dst, dst, 0x8, vlen_enc);
2274   }
2275 }
2276 
2277 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2278                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2279   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2280   switch (typ) {
2281   case T_BYTE:
2282     vpmovzxbw(vtmp1, src1, vlen_enc);
2283     vpmovzxbw(vtmp2, src2, vlen_enc);
2284     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2285     vextracti128(vtmp1, src1, 1);
2286     vextracti128(vtmp2, src2, 1);
2287     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2288     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2289     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2290     vpacksswb(dst, dst, vtmp3, vlen_enc);
2291     vpermpd(dst, dst, 0xd8, vlen_enc);
2292     break;
2293   case T_SHORT:
2294     vpmovzxwd(vtmp1, src1, vlen_enc);
2295     vpmovzxwd(vtmp2, src2, vlen_enc);
2296     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2297     vextracti128(vtmp1, src1, 1);
2298     vextracti128(vtmp2, src2, 1);
2299     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2300     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2301     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2302     vpackssdw(dst, dst, vtmp3, vlen_enc);
2303     vpermpd(dst, dst, 0xd8, vlen_enc);
2304     break;
2305   case T_INT:
2306     vpmovzxdq(vtmp1, src1, vlen_enc);
2307     vpmovzxdq(vtmp2, src2, vlen_enc);
2308     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2309     vpshufd(dst, dst, 8, vlen_enc);
2310     vpermq(dst, dst, 8, vlen_enc);
2311     vextracti128(vtmp1, src1, 1);
2312     vextracti128(vtmp2, src2, 1);
2313     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2314     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2315     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2316     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2317     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2318     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2319     break;
2320   default:
2321     assert(false, "Should not reach here");
2322   }
2323 }
2324 
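     // AVX-512 masked blend: per-lane select between src1 and src2 under kmask.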
2325 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2326   switch(typ) {
2327     case T_BYTE:
2328       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2329       break;
2330     case T_SHORT:
2331       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2332       break;
2333     case T_INT:
2334     case T_FLOAT:
2335       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2336       break;
2337     case T_LONG:
2338     case T_DOUBLE:
2339       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2340       break;
2341     default:
2342       assert(false,"Should not reach here.");
2343       break;
2344   }
2345 }
2346 
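     // Vector test for VectorTest nodes: sub-128-bit inputs are first duplicated so
     // ptest only sees valid data, 256-bit inputs use vptest, and 512-bit inputs are
     // compared byte-wise into a mask register which is then tested with
     // ktestql/kortestql.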
2347 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2348                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2349   switch(vlen) {
2350     case 4:
2351       assert(vtmp1 != xnoreg, "required.");
2352       // Broadcast lower 32 bits to 128 bits before ptest
2353       pshufd(vtmp1, src1, 0x0);
2354       if (bt == BoolTest::overflow) {
2355         assert(vtmp2 != xnoreg, "required.");
2356         pshufd(vtmp2, src2, 0x0);
2357       } else {
2358         assert(vtmp2 == xnoreg, "required.");
2359         vtmp2 = src2;
2360       }
2361       ptest(vtmp1, vtmp2);
2362      break;
2363     case 8:
2364       assert(vtmp1 != xnoreg, "required.");
2365       // Broadcast lower 64 bits to 128 bits before ptest
2366       pshufd(vtmp1, src1, 0x4);
2367       if (bt == BoolTest::overflow) {
2368         assert(vtmp2 != xnoreg, "required.");
2369         pshufd(vtmp2, src2, 0x4);
2370       } else {
2371         assert(vtmp2 == xnoreg, "required.");
2372         vtmp2 = src2;
2373       }
2374       ptest(vtmp1, vtmp2);
2375      break;
2376     case 16:
2377       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2378       ptest(src1, src2);
2379       break;
2380     case 32:
2381       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2382       vptest(src1, src2, Assembler::AVX_256bit);
2383       break;
2384     case 64:
2385       {
2386         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2387         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2388         if (bt == BoolTest::ne) {
2389           ktestql(mask, mask);
2390         } else {
2391           assert(bt == BoolTest::overflow, "required");
2392           kortestql(mask, mask);
2393         }
2394       }
2395       break;
2396     default:
2397       assert(false,"Should not reach here.");
2398       break;
2399   }
2400 }
2401 
2402 //-------------------------------------------------------------------------------------------
2403 
2404 // IndexOf for constant substrings with size >= 8 chars
2405 // which don't need to be loaded through the stack.
2406 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2407                                          Register cnt1, Register cnt2,
2408                                          int int_cnt2,  Register result,
2409                                          XMMRegister vec, Register tmp,
2410                                          int ae) {
2411   ShortBranchVerifier sbv(this);
2412   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2413   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2414 
2415   // This method uses the pcmpestri instruction with bound registers
2416   //   inputs:
2417   //     xmm - substring
2418   //     rax - substring length (elements count)
2419   //     mem - scanned string
2420   //     rdx - string length (elements count)
2421   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2422   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2423   //   outputs:
2424   //     rcx - matched index in string
2425   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2426   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2427   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2428   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2429   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2430 
2431   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2432         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2433         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2434 
2435   // Note, inline_string_indexOf() generates checks:
2436   // if (substr.count > string.count) return -1;
2437   // if (substr.count == 0) return 0;
2438   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2439 
2440   // Load substring.
2441   if (ae == StrIntrinsicNode::UL) {
2442     pmovzxbw(vec, Address(str2, 0));
2443   } else {
2444     movdqu(vec, Address(str2, 0));
2445   }
2446   movl(cnt2, int_cnt2);
2447   movptr(result, str1); // string addr
2448 
2449   if (int_cnt2 > stride) {
2450     jmpb(SCAN_TO_SUBSTR);
2451 
2452     // Reload substr for rescan; this code
2453     // is executed only for large substrings (> 8 chars).
2454     bind(RELOAD_SUBSTR);
2455     if (ae == StrIntrinsicNode::UL) {
2456       pmovzxbw(vec, Address(str2, 0));
2457     } else {
2458       movdqu(vec, Address(str2, 0));
2459     }
2460     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2461 
2462     bind(RELOAD_STR);
2463     // We came here after the beginning of the substring was
2464     // matched but the rest of it was not, so we need to search
2465     // again. Start from the next element after the previous match.
2466 
2467     // cnt2 is the number of remaining substring elements and
2468     // cnt1 is the number of remaining string elements when the compare failed.
2469     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2470     subl(cnt1, cnt2);
2471     addl(cnt1, int_cnt2);
2472     movl(cnt2, int_cnt2); // Now restore cnt2
2473 
2474     decrementl(cnt1);     // Shift to next element
2475     cmpl(cnt1, cnt2);
2476     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2477 
2478     addptr(result, (1<<scale1));
2479 
2480   } // (int_cnt2 > 8)
2481 
2482   // Scan string for start of substr in 16-byte vectors
2483   bind(SCAN_TO_SUBSTR);
2484   pcmpestri(vec, Address(result, 0), mode);
2485   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2486   subl(cnt1, stride);
2487   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2488   cmpl(cnt1, cnt2);
2489   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2490   addptr(result, 16);
2491   jmpb(SCAN_TO_SUBSTR);
2492 
2493   // Found a potential substr
2494   bind(FOUND_CANDIDATE);
2495   // Matched whole vector if first element matched (tmp(rcx) == 0).
2496   if (int_cnt2 == stride) {
2497     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2498   } else { // int_cnt2 > 8
2499     jccb(Assembler::overflow, FOUND_SUBSTR);
2500   }
2501   // After pcmpestri tmp(rcx) contains matched element index
2502   // Compute start addr of substr
2503   lea(result, Address(result, tmp, scale1));
2504 
2505   // Make sure string is still long enough
2506   subl(cnt1, tmp);
2507   cmpl(cnt1, cnt2);
2508   if (int_cnt2 == stride) {
2509     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2510   } else { // int_cnt2 > 8
2511     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2512   }
2513   // Left less than substring.
2514 
2515   bind(RET_NOT_FOUND);
2516   movl(result, -1);
2517   jmp(EXIT);
2518 
2519   if (int_cnt2 > stride) {
2520     // This code is optimized for the case when whole substring
2521     // is matched if its head is matched.
2522     bind(MATCH_SUBSTR_HEAD);
2523     pcmpestri(vec, Address(result, 0), mode);
2524     // Reload only the string if it does not match
2525     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2526 
2527     Label CONT_SCAN_SUBSTR;
2528     // Compare the rest of substring (> 8 chars).
2529     bind(FOUND_SUBSTR);
2530     // First 8 chars are already matched.
2531     negptr(cnt2);
2532     addptr(cnt2, stride);
2533 
2534     bind(SCAN_SUBSTR);
2535     subl(cnt1, stride);
2536     cmpl(cnt2, -stride); // Do not read beyond substring
2537     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2538     // Back-up strings to avoid reading beyond substring:
2539     // cnt1 = cnt1 - cnt2 + 8
2540     addl(cnt1, cnt2); // cnt2 is negative
2541     addl(cnt1, stride);
2542     movl(cnt2, stride); negptr(cnt2);
2543     bind(CONT_SCAN_SUBSTR);
2544     if (int_cnt2 < (int)G) {
2545       int tail_off1 = int_cnt2<<scale1;
2546       int tail_off2 = int_cnt2<<scale2;
2547       if (ae == StrIntrinsicNode::UL) {
2548         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2549       } else {
2550         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2551       }
2552       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2553     } else {
2554       // calculate index in register to avoid integer overflow (int_cnt2*2)
2555       movl(tmp, int_cnt2);
2556       addptr(tmp, cnt2);
2557       if (ae == StrIntrinsicNode::UL) {
2558         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2559       } else {
2560         movdqu(vec, Address(str2, tmp, scale2, 0));
2561       }
2562       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2563     }
2564     // Need to reload strings pointers if not matched whole vector
2565     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2566     addptr(cnt2, stride);
2567     jcc(Assembler::negative, SCAN_SUBSTR);
2568     // Fall through if found full substring
2569 
2570   } // (int_cnt2 > 8)
2571 
2572   bind(RET_FOUND);
2573   // Found result if we matched full small substring.
2574   // Compute substr offset
2575   subptr(result, str1);
2576   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2577     shrl(result, 1); // index
2578   }
2579   bind(EXIT);
2580 
2581 } // string_indexofC8
2582 
2583 // Small strings are loaded through the stack if they cross a page boundary.
2584 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2585                                        Register cnt1, Register cnt2,
2586                                        int int_cnt2,  Register result,
2587                                        XMMRegister vec, Register tmp,
2588                                        int ae) {
2589   ShortBranchVerifier sbv(this);
2590   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2591   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2592 
2593   //
2594   // int_cnt2 is the length of a small (< 8 chars) constant substring
2595   // or (-1) for a non-constant substring, in which case its length
2596   // is in cnt2 register.
2597   //
2598   // Note, inline_string_indexOf() generates checks:
2599   // if (substr.count > string.count) return -1;
2600   // if (substr.count == 0) return 0;
2601   //
2602   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2603   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2604   // This method uses the pcmpestri instruction with bound registers
2605   //   inputs:
2606   //     xmm - substring
2607   //     rax - substring length (elements count)
2608   //     mem - scanned string
2609   //     rdx - string length (elements count)
2610   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2611   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2612   //   outputs:
2613   //     rcx - matched index in string
2614   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2615   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2616   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2617   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2618 
2619   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2620         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2621         FOUND_CANDIDATE;
2622 
2623   { //========================================================
2624     // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
2626     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2627 
2628     movptr(tmp, rsp); // save old SP
2629 
2630     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2631       if (int_cnt2 == (1>>scale2)) { // One byte
2632         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2633         load_unsigned_byte(result, Address(str2, 0));
2634         movdl(vec, result); // move 32 bits
2635       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2636         // Not enough header space in 32-bit VM: 12+3 = 15.
2637         movl(result, Address(str2, -1));
2638         shrl(result, 8);
2639         movdl(vec, result); // move 32 bits
2640       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2641         load_unsigned_short(result, Address(str2, 0));
2642         movdl(vec, result); // move 32 bits
2643       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2644         movdl(vec, Address(str2, 0)); // move 32 bits
2645       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2646         movq(vec, Address(str2, 0));  // move 64 bits
2647       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2648         // Array header size is 12 bytes in 32-bit VM
2649         // + 6 bytes for 3 chars == 18 bytes,
2650         // enough space to load vec and shift.
2651         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
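        // Note: the loads below start before str2 (the effective offset is
        // negative), which is safe because of the array header space checked
        // above.  The following psrldq shifts the unwanted leading bytes out,
        // leaving the substring in the low elements of vec.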
2652         if (ae == StrIntrinsicNode::UL) {
2653           int tail_off = int_cnt2-8;
2654           pmovzxbw(vec, Address(str2, tail_off));
2655           psrldq(vec, -2*tail_off);
2656         }
2657         else {
2658           int tail_off = int_cnt2*(1<<scale2);
2659           movdqu(vec, Address(str2, tail_off-16));
2660           psrldq(vec, 16-tail_off);
2661         }
2662       }
2663     } else { // not constant substring
2664       cmpl(cnt2, stride);
2665       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2666 
      // We can read beyond the string if str+16 does not cross a page boundary,
      // since heaps are aligned and mapped by pages.
2669       assert(os::vm_page_size() < (int)G, "default page should be small");
2670       movl(result, str2); // We need only low 32 bits
2671       andl(result, (os::vm_page_size()-1));
2672       cmpl(result, (os::vm_page_size()-16));
2673       jccb(Assembler::belowEqual, CHECK_STR);
2674 
      // Move small strings to the stack so that 16 bytes can be loaded into vec.
2676       subptr(rsp, 16);
2677       int stk_offset = wordSize-(1<<scale2);
2678       push(cnt2);
2679 
2680       bind(COPY_SUBSTR);
2681       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2682         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2683         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2684       } else if (ae == StrIntrinsicNode::UU) {
2685         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2686         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2687       }
2688       decrement(cnt2);
2689       jccb(Assembler::notZero, COPY_SUBSTR);
2690 
2691       pop(cnt2);
2692       movptr(str2, rsp);  // New substring address
2693     } // non constant
2694 
2695     bind(CHECK_STR);
2696     cmpl(cnt1, stride);
2697     jccb(Assembler::aboveEqual, BIG_STRINGS);
2698 
2699     // Check cross page boundary.
2700     movl(result, str1); // We need only low 32 bits
2701     andl(result, (os::vm_page_size()-1));
2702     cmpl(result, (os::vm_page_size()-16));
2703     jccb(Assembler::belowEqual, BIG_STRINGS);
2704 
2705     subptr(rsp, 16);
2706     int stk_offset = -(1<<scale1);
2707     if (int_cnt2 < 0) { // not constant
2708       push(cnt2);
2709       stk_offset += wordSize;
2710     }
2711     movl(cnt2, cnt1);
2712 
2713     bind(COPY_STR);
2714     if (ae == StrIntrinsicNode::LL) {
2715       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2716       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2717     } else {
2718       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2719       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2720     }
2721     decrement(cnt2);
2722     jccb(Assembler::notZero, COPY_STR);
2723 
2724     if (int_cnt2 < 0) { // not constant
2725       pop(cnt2);
2726     }
2727     movptr(str1, rsp);  // New string address
2728 
2729     bind(BIG_STRINGS);
2730     // Load substring.
2731     if (int_cnt2 < 0) { // -1
2732       if (ae == StrIntrinsicNode::UL) {
2733         pmovzxbw(vec, Address(str2, 0));
2734       } else {
2735         movdqu(vec, Address(str2, 0));
2736       }
2737       push(cnt2);       // substr count
2738       push(str2);       // substr addr
2739       push(str1);       // string addr
2740     } else {
2741       // Small (< 8 chars) constant substrings are loaded already.
2742       movl(cnt2, int_cnt2);
2743     }
2744     push(tmp);  // original SP
2745 
2746   } // Finished loading
2747 
2748   //========================================================
2749   // Start search
2750   //
2751 
2752   movptr(result, str1); // string addr
2753 
2754   if (int_cnt2  < 0) {  // Only for non constant substring
2755     jmpb(SCAN_TO_SUBSTR);
2756 
2757     // SP saved at sp+0
2758     // String saved at sp+1*wordSize
2759     // Substr saved at sp+2*wordSize
2760     // Substr count saved at sp+3*wordSize
2761 
    // Reload substr for rescan; this code
2763     // is executed only for large substrings (> 8 chars)
2764     bind(RELOAD_SUBSTR);
2765     movptr(str2, Address(rsp, 2*wordSize));
2766     movl(cnt2, Address(rsp, 3*wordSize));
2767     if (ae == StrIntrinsicNode::UL) {
2768       pmovzxbw(vec, Address(str2, 0));
2769     } else {
2770       movdqu(vec, Address(str2, 0));
2771     }
2772     // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
2774     // again. Start from the next element after the previous match.
2775     subptr(str1, result); // Restore counter
2776     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2777       shrl(str1, 1);
2778     }
2779     addl(cnt1, str1);
2780     decrementl(cnt1);   // Shift to next element
2781     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2783 
2784     addptr(result, (1<<scale1));
2785   } // non constant
2786 
2787   // Scan string for start of substr in 16-byte vectors
2788   bind(SCAN_TO_SUBSTR);
2789   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2790   pcmpestri(vec, Address(result, 0), mode);
2791   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2792   subl(cnt1, stride);
2793   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2794   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2796   addptr(result, 16);
2797 
2798   bind(ADJUST_STR);
2799   cmpl(cnt1, stride); // Do not read beyond string
2800   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2801   // Back-up string to avoid reading beyond string.
2802   lea(result, Address(result, cnt1, scale1, -16));
2803   movl(cnt1, stride);
2804   jmpb(SCAN_TO_SUBSTR);
2805 
2806   // Found a potential substr
2807   bind(FOUND_CANDIDATE);
2808   // After pcmpestri tmp(rcx) contains matched element index
2809 
2810   // Make sure string is still long enough
2811   subl(cnt1, tmp);
2812   cmpl(cnt1, cnt2);
2813   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
2815 
2816   bind(RET_NOT_FOUND);
2817   movl(result, -1);
2818   jmp(CLEANUP);
2819 
2820   bind(FOUND_SUBSTR);
2821   // Compute start addr of substr
2822   lea(result, Address(result, tmp, scale1));
2823   if (int_cnt2 > 0) { // Constant substring
2824     // Repeat search for small substring (< 8 chars)
2825     // from new point without reloading substring.
2826     // Have to check that we don't read beyond string.
2827     cmpl(tmp, stride-int_cnt2);
2828     jccb(Assembler::greater, ADJUST_STR);
2829     // Fall through if matched whole substring.
2830   } else { // non constant
2831     assert(int_cnt2 == -1, "should be != 0");
2832 
2833     addl(tmp, cnt2);
2834     // Found result if we matched whole substring.
2835     cmpl(tmp, stride);
2836     jcc(Assembler::lessEqual, RET_FOUND);
2837 
2838     // Repeat search for small substring (<= 8 chars)
2839     // from new point 'str1' without reloading substring.
2840     cmpl(cnt2, stride);
2841     // Have to check that we don't read beyond string.
2842     jccb(Assembler::lessEqual, ADJUST_STR);
2843 
2844     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2845     // Compare the rest of substring (> 8 chars).
2846     movptr(str1, result);
2847 
2848     cmpl(tmp, cnt2);
2849     // First 8 chars are already matched.
2850     jccb(Assembler::equal, CHECK_NEXT);
2851 
2852     bind(SCAN_SUBSTR);
2853     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if the whole vector did not match
2855     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2856 
2857     bind(CHECK_NEXT);
2858     subl(cnt2, stride);
2859     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2860     addptr(str1, 16);
2861     if (ae == StrIntrinsicNode::UL) {
2862       addptr(str2, 8);
2863     } else {
2864       addptr(str2, 16);
2865     }
2866     subl(cnt1, stride);
2867     cmpl(cnt2, stride); // Do not read beyond substring
2868     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2869     // Back-up strings to avoid reading beyond substring.
2870 
2871     if (ae == StrIntrinsicNode::UL) {
2872       lea(str2, Address(str2, cnt2, scale2, -8));
2873       lea(str1, Address(str1, cnt2, scale1, -16));
2874     } else {
2875       lea(str2, Address(str2, cnt2, scale2, -16));
2876       lea(str1, Address(str1, cnt2, scale1, -16));
2877     }
2878     subl(cnt1, cnt2);
2879     movl(cnt2, stride);
2880     addl(cnt1, stride);
2881     bind(CONT_SCAN_SUBSTR);
2882     if (ae == StrIntrinsicNode::UL) {
2883       pmovzxbw(vec, Address(str2, 0));
2884     } else {
2885       movdqu(vec, Address(str2, 0));
2886     }
2887     jmp(SCAN_SUBSTR);
2888 
2889     bind(RET_FOUND_LONG);
2890     movptr(str1, Address(rsp, wordSize));
2891   } // non constant
2892 
2893   bind(RET_FOUND);
2894   // Compute substr offset
2895   subptr(result, str1);
2896   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2897     shrl(result, 1); // index
2898   }
2899   bind(CLEANUP);
2900   pop(rsp); // restore SP
2901 
2902 } // string_indexof
2903 
2904 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2905                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2906   ShortBranchVerifier sbv(this);
2907   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2908 
2909   int stride = 8;
2910 
2911   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2912         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2913         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2914         FOUND_SEQ_CHAR, DONE_LABEL;
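  // Scalar equivalent (an illustrative sketch only): scan cnt1 UTF-16 chars
  // starting at str1 for ch and return the index of the first occurrence,
  // or -1 if it is not present:
  //
  //   for (int i = 0; i < cnt1; i++) {
  //     if (str1[i] == ch) return i;
  //   }
  //   return -1;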
2915 
2916   movptr(result, str1);
2917   if (UseAVX >= 2) {
2918     cmpl(cnt1, stride);
2919     jcc(Assembler::less, SCAN_TO_CHAR);
2920     cmpl(cnt1, 2*stride);
2921     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2922     movdl(vec1, ch);
2923     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2924     vpxor(vec2, vec2);
2925     movl(tmp, cnt1);
2926     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2927     andl(cnt1,0x0000000F);  //tail count (in chars)
2928 
2929     bind(SCAN_TO_16_CHAR_LOOP);
2930     vmovdqu(vec3, Address(result, 0));
2931     vpcmpeqw(vec3, vec3, vec1, 1);
2932     vptest(vec2, vec3);
2933     jcc(Assembler::carryClear, FOUND_CHAR);
2934     addptr(result, 32);
2935     subl(tmp, 2*stride);
2936     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2937     jmp(SCAN_TO_8_CHAR);
2938     bind(SCAN_TO_8_CHAR_INIT);
2939     movdl(vec1, ch);
2940     pshuflw(vec1, vec1, 0x00);
2941     pshufd(vec1, vec1, 0);
2942     pxor(vec2, vec2);
2943   }
2944   bind(SCAN_TO_8_CHAR);
2945   cmpl(cnt1, stride);
2946   jcc(Assembler::less, SCAN_TO_CHAR);
2947   if (UseAVX < 2) {
2948     movdl(vec1, ch);
2949     pshuflw(vec1, vec1, 0x00);
2950     pshufd(vec1, vec1, 0);
2951     pxor(vec2, vec2);
2952   }
2953   movl(tmp, cnt1);
2954   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2955   andl(cnt1,0x00000007);  //tail count (in chars)
2956 
2957   bind(SCAN_TO_8_CHAR_LOOP);
2958   movdqu(vec3, Address(result, 0));
2959   pcmpeqw(vec3, vec1);
2960   ptest(vec2, vec3);
2961   jcc(Assembler::carryClear, FOUND_CHAR);
2962   addptr(result, 16);
2963   subl(tmp, stride);
2964   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2965   bind(SCAN_TO_CHAR);
2966   testl(cnt1, cnt1);
2967   jcc(Assembler::zero, RET_NOT_FOUND);
2968   bind(SCAN_TO_CHAR_LOOP);
2969   load_unsigned_short(tmp, Address(result, 0));
2970   cmpl(ch, tmp);
2971   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2972   addptr(result, 2);
2973   subl(cnt1, 1);
2974   jccb(Assembler::zero, RET_NOT_FOUND);
2975   jmp(SCAN_TO_CHAR_LOOP);
2976 
2977   bind(RET_NOT_FOUND);
2978   movl(result, -1);
2979   jmpb(DONE_LABEL);
2980 
2981   bind(FOUND_CHAR);
2982   if (UseAVX >= 2) {
2983     vpmovmskb(tmp, vec3);
2984   } else {
2985     pmovmskb(tmp, vec3);
2986   }
2987   bsfl(ch, tmp);
2988   addptr(result, ch);
2989 
2990   bind(FOUND_SEQ_CHAR);
2991   subptr(result, str1);
2992   shrl(result, 1);
2993 
2994   bind(DONE_LABEL);
2995 } // string_indexof_char
2996 
2997 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2998                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2999   ShortBranchVerifier sbv(this);
3000   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3001 
3002   int stride = 16;
3003 
3004   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3005         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3006         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3007         FOUND_SEQ_CHAR, DONE_LABEL;
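  // Same contract as string_indexof_char above, but over Latin-1 bytes:
  // scan cnt1 bytes at str1 for ch and return the index of the first match,
  // or -1 if none (see the illustrative sketch in string_indexof_char).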
3008 
3009   movptr(result, str1);
3010   if (UseAVX >= 2) {
3011     cmpl(cnt1, stride);
3012     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3013     cmpl(cnt1, stride*2);
3014     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3015     movdl(vec1, ch);
3016     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3017     vpxor(vec2, vec2);
3018     movl(tmp, cnt1);
3019     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3020     andl(cnt1,0x0000001F);  //tail count (in chars)
3021 
3022     bind(SCAN_TO_32_CHAR_LOOP);
3023     vmovdqu(vec3, Address(result, 0));
3024     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3025     vptest(vec2, vec3);
3026     jcc(Assembler::carryClear, FOUND_CHAR);
3027     addptr(result, 32);
3028     subl(tmp, stride*2);
3029     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3030     jmp(SCAN_TO_16_CHAR);
3031 
3032     bind(SCAN_TO_16_CHAR_INIT);
3033     movdl(vec1, ch);
3034     pxor(vec2, vec2);
3035     pshufb(vec1, vec2);
3036   }
3037 
3038   bind(SCAN_TO_16_CHAR);
3039   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3041   if (UseAVX < 2) {
3042     movdl(vec1, ch);
3043     pxor(vec2, vec2);
3044     pshufb(vec1, vec2);
3045   }
3046   movl(tmp, cnt1);
3047   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3048   andl(cnt1,0x0000000F);  //tail count (in bytes)
3049 
3050   bind(SCAN_TO_16_CHAR_LOOP);
3051   movdqu(vec3, Address(result, 0));
3052   pcmpeqb(vec3, vec1);
3053   ptest(vec2, vec3);
3054   jcc(Assembler::carryClear, FOUND_CHAR);
3055   addptr(result, 16);
3056   subl(tmp, stride);
3057   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3058 
3059   bind(SCAN_TO_CHAR_INIT);
3060   testl(cnt1, cnt1);
3061   jcc(Assembler::zero, RET_NOT_FOUND);
3062   bind(SCAN_TO_CHAR_LOOP);
3063   load_unsigned_byte(tmp, Address(result, 0));
3064   cmpl(ch, tmp);
3065   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3066   addptr(result, 1);
3067   subl(cnt1, 1);
3068   jccb(Assembler::zero, RET_NOT_FOUND);
3069   jmp(SCAN_TO_CHAR_LOOP);
3070 
3071   bind(RET_NOT_FOUND);
3072   movl(result, -1);
3073   jmpb(DONE_LABEL);
3074 
3075   bind(FOUND_CHAR);
3076   if (UseAVX >= 2) {
3077     vpmovmskb(tmp, vec3);
3078   } else {
3079     pmovmskb(tmp, vec3);
3080   }
3081   bsfl(ch, tmp);
3082   addptr(result, ch);
3083 
3084   bind(FOUND_SEQ_CHAR);
3085   subptr(result, str1);
3086 
3087   bind(DONE_LABEL);
3088 } // stringL_indexof_char
3089 
3090 // helper function for string_compare
3091 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3092                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3093                                            Address::ScaleFactor scale2, Register index, int ae) {
3094   if (ae == StrIntrinsicNode::LL) {
3095     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3096     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3097   } else if (ae == StrIntrinsicNode::UU) {
3098     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3099     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3100   } else {
3101     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3102     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3103   }
3104 }
3105 
3106 // Compare strings, used for char[] and byte[].
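// The generated code computes, roughly, the following (an illustrative
// sketch only; cnt1/cnt2 are element counts, and the mixed UL case negates
// the result at the very end, see DONE_LABEL below):
//
//   int min = Math.min(cnt1, cnt2);
//   for (int i = 0; i < min; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;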
3107 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3108                                        Register cnt1, Register cnt2, Register result,
3109                                        XMMRegister vec1, int ae, KRegister mask) {
3110   ShortBranchVerifier sbv(this);
3111   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3112   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3113   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3114   int stride2x2 = 0x40;
3115   Address::ScaleFactor scale = Address::no_scale;
3116   Address::ScaleFactor scale1 = Address::no_scale;
3117   Address::ScaleFactor scale2 = Address::no_scale;
3118 
3119   if (ae != StrIntrinsicNode::LL) {
3120     stride2x2 = 0x20;
3121   }
3122 
3123   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3124     shrl(cnt2, 1);
3125   }
  // Compute the minimum of the string lengths and push the
  // difference of the string lengths onto the stack.
  // Use a conditional move to select the minimum length.
3129   movl(result, cnt1);
3130   subl(cnt1, cnt2);
3131   push(cnt1);
3132   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3133 
3134   // Is the minimum length zero?
3135   testl(cnt2, cnt2);
3136   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3137   if (ae == StrIntrinsicNode::LL) {
3138     // Load first bytes
3139     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3140     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3141   } else if (ae == StrIntrinsicNode::UU) {
3142     // Load first characters
3143     load_unsigned_short(result, Address(str1, 0));
3144     load_unsigned_short(cnt1, Address(str2, 0));
3145   } else {
3146     load_unsigned_byte(result, Address(str1, 0));
3147     load_unsigned_short(cnt1, Address(str2, 0));
3148   }
3149   subl(result, cnt1);
3150   jcc(Assembler::notZero,  POP_LABEL);
3151 
3152   if (ae == StrIntrinsicNode::UU) {
3153     // Divide length by 2 to get number of chars
3154     shrl(cnt2, 1);
3155   }
3156   cmpl(cnt2, 1);
3157   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3158 
3159   // Check if the strings start at the same location and setup scale and stride
3160   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3161     cmpptr(str1, str2);
3162     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3163     if (ae == StrIntrinsicNode::LL) {
3164       scale = Address::times_1;
3165       stride = 16;
3166     } else {
3167       scale = Address::times_2;
3168       stride = 8;
3169     }
3170   } else {
3171     scale1 = Address::times_1;
3172     scale2 = Address::times_2;
3173     // scale not used
3174     stride = 8;
3175   }
3176 
3177   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3178     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3179     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3180     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3181     Label COMPARE_TAIL_LONG;
3182     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3183 
3184     int pcmpmask = 0x19;
3185     if (ae == StrIntrinsicNode::LL) {
3186       pcmpmask &= ~0x01;
3187     }
3188 
3189     // Setup to compare 16-chars (32-bytes) vectors,
3190     // start from first character again because it has aligned address.
3191     if (ae == StrIntrinsicNode::LL) {
3192       stride2 = 32;
3193     } else {
3194       stride2 = 16;
3195     }
3196     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3197       adr_stride = stride << scale;
3198     } else {
3199       adr_stride1 = 8;  //stride << scale1;
3200       adr_stride2 = 16; //stride << scale2;
3201     }
3202 
3203     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3204     // rax and rdx are used by pcmpestri as elements counters
3205     movl(result, cnt2);
3206     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3207     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3208 
3209     // fast path : compare first 2 8-char vectors.
3210     bind(COMPARE_16_CHARS);
3211     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3212       movdqu(vec1, Address(str1, 0));
3213     } else {
3214       pmovzxbw(vec1, Address(str1, 0));
3215     }
3216     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3217     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3218 
3219     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3220       movdqu(vec1, Address(str1, adr_stride));
3221       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3222     } else {
3223       pmovzxbw(vec1, Address(str1, adr_stride1));
3224       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3225     }
3226     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3227     addl(cnt1, stride);
3228 
3229     // Compare the characters at index in cnt1
3230     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3231     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3232     subl(result, cnt2);
3233     jmp(POP_LABEL);
3234 
3235     // Setup the registers to start vector comparison loop
3236     bind(COMPARE_WIDE_VECTORS);
3237     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3238       lea(str1, Address(str1, result, scale));
3239       lea(str2, Address(str2, result, scale));
3240     } else {
3241       lea(str1, Address(str1, result, scale1));
3242       lea(str2, Address(str2, result, scale2));
3243     }
3244     subl(result, stride2);
3245     subl(cnt2, stride2);
3246     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3247     negptr(result);
3248 
3249     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3250     bind(COMPARE_WIDE_VECTORS_LOOP);
3251 
3252 #ifdef _LP64
3253     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3254       cmpl(cnt2, stride2x2);
3255       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3256       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // cnt2 is not a multiple of stride2x2, so we cannot step by 0x40
3258 
3259       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3260       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3261         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3262         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3263       } else {
3264         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3265         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3266       }
3267       kortestql(mask, mask);
3268       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3269       addptr(result, stride2x2);  // update since we already compared at this addr
3270       subl(cnt2, stride2x2);      // and sub the size too
3271       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3272 
3273       vpxor(vec1, vec1);
3274       jmpb(COMPARE_WIDE_TAIL);
3275     }//if (VM_Version::supports_avx512vlbw())
3276 #endif // _LP64
3277 
3278 
3279     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3280     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3281       vmovdqu(vec1, Address(str1, result, scale));
3282       vpxor(vec1, Address(str2, result, scale));
3283     } else {
3284       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3285       vpxor(vec1, Address(str2, result, scale2));
3286     }
3287     vptest(vec1, vec1);
3288     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3289     addptr(result, stride2);
3290     subl(cnt2, stride2);
3291     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3292     // clean upper bits of YMM registers
3293     vpxor(vec1, vec1);
3294 
3295     // compare wide vectors tail
3296     bind(COMPARE_WIDE_TAIL);
3297     testptr(result, result);
3298     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3299 
3300     movl(result, stride2);
3301     movl(cnt2, result);
3302     negptr(result);
3303     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3304 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3306     bind(VECTOR_NOT_EQUAL);
3307     // clean upper bits of YMM registers
3308     vpxor(vec1, vec1);
3309     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3310       lea(str1, Address(str1, result, scale));
3311       lea(str2, Address(str2, result, scale));
3312     } else {
3313       lea(str1, Address(str1, result, scale1));
3314       lea(str2, Address(str2, result, scale2));
3315     }
3316     jmp(COMPARE_16_CHARS);
3317 
    // Compare tail chars, length between 1 and 15 chars
3319     bind(COMPARE_TAIL_LONG);
3320     movl(cnt2, result);
3321     cmpl(cnt2, stride);
3322     jcc(Assembler::less, COMPARE_SMALL_STR);
3323 
3324     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3325       movdqu(vec1, Address(str1, 0));
3326     } else {
3327       pmovzxbw(vec1, Address(str1, 0));
3328     }
3329     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3330     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3331     subptr(cnt2, stride);
3332     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3333     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3334       lea(str1, Address(str1, result, scale));
3335       lea(str2, Address(str2, result, scale));
3336     } else {
3337       lea(str1, Address(str1, result, scale1));
3338       lea(str2, Address(str2, result, scale2));
3339     }
3340     negptr(cnt2);
3341     jmpb(WHILE_HEAD_LABEL);
3342 
3343     bind(COMPARE_SMALL_STR);
3344   } else if (UseSSE42Intrinsics) {
3345     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3346     int pcmpmask = 0x19;
3347     // Setup to compare 8-char (16-byte) vectors,
3348     // start from first character again because it has aligned address.
3349     movl(result, cnt2);
3350     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3351     if (ae == StrIntrinsicNode::LL) {
3352       pcmpmask &= ~0x01;
3353     }
3354     jcc(Assembler::zero, COMPARE_TAIL);
3355     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3356       lea(str1, Address(str1, result, scale));
3357       lea(str2, Address(str2, result, scale));
3358     } else {
3359       lea(str1, Address(str1, result, scale1));
3360       lea(str2, Address(str2, result, scale2));
3361     }
3362     negptr(result);
3363 
3364     // pcmpestri
3365     //   inputs:
3366     //     vec1- substring
3367     //     rax - negative string length (elements count)
3368     //     mem - scanned string
3369     //     rdx - string length (elements count)
3370     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3371     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3372     //   outputs:
3373     //     rcx - first mismatched element index
3374     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3375 
3376     bind(COMPARE_WIDE_VECTORS);
3377     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3378       movdqu(vec1, Address(str1, result, scale));
3379       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3380     } else {
3381       pmovzxbw(vec1, Address(str1, result, scale1));
3382       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3383     }
3384     // After pcmpestri cnt1(rcx) contains mismatched element index
3385 
3386     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3387     addptr(result, stride);
3388     subptr(cnt2, stride);
3389     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3390 
3391     // compare wide vectors tail
3392     testptr(result, result);
3393     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3394 
3395     movl(cnt2, stride);
3396     movl(result, stride);
3397     negptr(result);
3398     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3399       movdqu(vec1, Address(str1, result, scale));
3400       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3401     } else {
3402       pmovzxbw(vec1, Address(str1, result, scale1));
3403       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3404     }
3405     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3406 
3407     // Mismatched characters in the vectors
3408     bind(VECTOR_NOT_EQUAL);
3409     addptr(cnt1, result);
3410     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3411     subl(result, cnt2);
3412     jmpb(POP_LABEL);
3413 
3414     bind(COMPARE_TAIL); // limit is zero
3415     movl(cnt2, result);
3416     // Fallthru to tail compare
3417   }
3418   // Shift str2 and str1 to the end of the arrays, negate min
3419   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3420     lea(str1, Address(str1, cnt2, scale));
3421     lea(str2, Address(str2, cnt2, scale));
3422   } else {
3423     lea(str1, Address(str1, cnt2, scale1));
3424     lea(str2, Address(str2, cnt2, scale2));
3425   }
3426   decrementl(cnt2);  // first character was compared already
3427   negptr(cnt2);
3428 
3429   // Compare the rest of the elements
3430   bind(WHILE_HEAD_LABEL);
3431   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3432   subl(result, cnt1);
3433   jccb(Assembler::notZero, POP_LABEL);
3434   increment(cnt2);
3435   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3436 
3437   // Strings are equal up to min length.  Return the length difference.
3438   bind(LENGTH_DIFF_LABEL);
3439   pop(result);
3440   if (ae == StrIntrinsicNode::UU) {
3441     // Divide diff by 2 to get number of chars
3442     sarl(result, 1);
3443   }
3444   jmpb(DONE_LABEL);
3445 
3446 #ifdef _LP64
3447   if (VM_Version::supports_avx512vlbw()) {
3448 
3449     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3450 
3451     kmovql(cnt1, mask);
3452     notq(cnt1);
3453     bsfq(cnt2, cnt1);
3454     if (ae != StrIntrinsicNode::LL) {
3455       // Divide diff by 2 to get number of chars
3456       sarl(cnt2, 1);
3457     }
3458     addq(result, cnt2);
3459     if (ae == StrIntrinsicNode::LL) {
3460       load_unsigned_byte(cnt1, Address(str2, result));
3461       load_unsigned_byte(result, Address(str1, result));
3462     } else if (ae == StrIntrinsicNode::UU) {
3463       load_unsigned_short(cnt1, Address(str2, result, scale));
3464       load_unsigned_short(result, Address(str1, result, scale));
3465     } else {
3466       load_unsigned_short(cnt1, Address(str2, result, scale2));
3467       load_unsigned_byte(result, Address(str1, result, scale1));
3468     }
3469     subl(result, cnt1);
3470     jmpb(POP_LABEL);
3471   }//if (VM_Version::supports_avx512vlbw())
3472 #endif // _LP64
3473 
3474   // Discard the stored length difference
3475   bind(POP_LABEL);
3476   pop(cnt1);
3477 
3478   // That's it
3479   bind(DONE_LABEL);
3480   if(ae == StrIntrinsicNode::UL) {
3481     negl(result);
3482   }
3483 
3484 }
3485 
// Search for a non-ASCII character (negative byte value) in a byte array,
// return true if it contains any and false otherwise.
3488 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3489 //   @IntrinsicCandidate
3490 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3491 //     for (int i = off; i < off + len; i++) {
3492 //       if (ba[i] < 0) {
3493 //         return true;
3494 //       }
3495 //     }
3496 //     return false;
3497 //   }
3498 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3499   Register result, Register tmp1,
3500   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3501   // rsi: byte array
3502   // rcx: len
3503   // rax: result
3504   ShortBranchVerifier sbv(this);
3505   assert_different_registers(ary1, len, result, tmp1);
3506   assert_different_registers(vec1, vec2);
3507   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3508 
3509   // len == 0
3510   testl(len, len);
3511   jcc(Assembler::zero, FALSE_LABEL);
3512 
3513   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3514     VM_Version::supports_avx512vlbw() &&
3515     VM_Version::supports_bmi2()) {
3516 
3517     Label test_64_loop, test_tail;
3518     Register tmp3_aliased = len;
3519 
3520     movl(tmp1, len);
3521     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3522 
3523     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3524     andl(len, ~(64 - 1));    // vector count (in chars)
3525     jccb(Assembler::zero, test_tail);
3526 
3527     lea(ary1, Address(ary1, len, Address::times_1));
3528     negptr(len);
3529 
3530     bind(test_64_loop);
3531     // Check whether our 64 elements of size byte contain negatives
3532     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3533     kortestql(mask1, mask1);
3534     jcc(Assembler::notZero, TRUE_LABEL);
3535 
3536     addptr(len, 64);
3537     jccb(Assembler::notZero, test_64_loop);
3538 
3539 
3540     bind(test_tail);
3541     // bail out when there is nothing to be done
3542     testl(tmp1, -1);
3543     jcc(Assembler::zero, FALSE_LABEL);
3544 
    // Build a mask with the low tmp1 bits set, i.e. ~(~0 << tmp1)
    // (in the 32-bit scenario the same mask is composed from the byte table below).
3546 #ifdef _LP64
3547     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3548     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3549     notq(tmp3_aliased);
3550     kmovql(mask2, tmp3_aliased);
3551 #else
3552     Label k_init;
3553     jmp(k_init);
3554 
    // We cannot read 64 bits from a general purpose register, thus we move
    // the data required to compose 64 1's to the instruction stream.
    // We emit a 64-byte series of the elements 0..63 which is later used as
    // compare targets against the tail count contained in the tmp1 register.
    // The result is a k register having tmp1 consecutive 1's counting from
    // the least significant bit.
3561     address tmp = pc();
3562     emit_int64(0x0706050403020100);
3563     emit_int64(0x0F0E0D0C0B0A0908);
3564     emit_int64(0x1716151413121110);
3565     emit_int64(0x1F1E1D1C1B1A1918);
3566     emit_int64(0x2726252423222120);
3567     emit_int64(0x2F2E2D2C2B2A2928);
3568     emit_int64(0x3736353433323130);
3569     emit_int64(0x3F3E3D3C3B3A3938);
3570 
3571     bind(k_init);
3572     lea(len, InternalAddress(tmp));
3573     // create mask to test for negative byte inside a vector
3574     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3575     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3576 
3577 #endif
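    // mask2 has the low tmp1 bits set, so the masked compare below only
    // examines the tail bytes; ktestq then reports whether any of those
    // tail bytes is negative.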
3578     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3579     ktestq(mask1, mask2);
3580     jcc(Assembler::notZero, TRUE_LABEL);
3581 
3582     jmp(FALSE_LABEL);
3583   } else {
3584     movl(result, len); // copy
3585 
3586     if (UseAVX >= 2 && UseSSE >= 2) {
3587       // With AVX2, use 32-byte vector compare
3588       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3589 
3590       // Compare 32-byte vectors
3591       andl(result, 0x0000001f);  //   tail count (in bytes)
3592       andl(len, 0xffffffe0);   // vector count (in bytes)
3593       jccb(Assembler::zero, COMPARE_TAIL);
3594 
3595       lea(ary1, Address(ary1, len, Address::times_1));
3596       negptr(len);
3597 
3598       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3599       movdl(vec2, tmp1);
3600       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3601 
3602       bind(COMPARE_WIDE_VECTORS);
3603       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3604       vptest(vec1, vec2);
3605       jccb(Assembler::notZero, TRUE_LABEL);
3606       addptr(len, 32);
3607       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3608 
3609       testl(result, result);
3610       jccb(Assembler::zero, FALSE_LABEL);
3611 
3612       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3613       vptest(vec1, vec2);
3614       jccb(Assembler::notZero, TRUE_LABEL);
3615       jmpb(FALSE_LABEL);
3616 
3617       bind(COMPARE_TAIL); // len is zero
3618       movl(len, result);
3619       // Fallthru to tail compare
3620     } else if (UseSSE42Intrinsics) {
3621       // With SSE4.2, use double quad vector compare
3622       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3623 
3624       // Compare 16-byte vectors
3625       andl(result, 0x0000000f);  //   tail count (in bytes)
3626       andl(len, 0xfffffff0);   // vector count (in bytes)
3627       jcc(Assembler::zero, COMPARE_TAIL);
3628 
3629       lea(ary1, Address(ary1, len, Address::times_1));
3630       negptr(len);
3631 
3632       movl(tmp1, 0x80808080);
3633       movdl(vec2, tmp1);
3634       pshufd(vec2, vec2, 0);
3635 
3636       bind(COMPARE_WIDE_VECTORS);
3637       movdqu(vec1, Address(ary1, len, Address::times_1));
3638       ptest(vec1, vec2);
3639       jcc(Assembler::notZero, TRUE_LABEL);
3640       addptr(len, 16);
3641       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3642 
3643       testl(result, result);
3644       jcc(Assembler::zero, FALSE_LABEL);
3645 
3646       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3647       ptest(vec1, vec2);
3648       jccb(Assembler::notZero, TRUE_LABEL);
3649       jmpb(FALSE_LABEL);
3650 
3651       bind(COMPARE_TAIL); // len is zero
3652       movl(len, result);
3653       // Fallthru to tail compare
3654     }
3655   }
3656   // Compare 4-byte vectors
3657   andl(len, 0xfffffffc); // vector count (in bytes)
3658   jccb(Assembler::zero, COMPARE_CHAR);
3659 
3660   lea(ary1, Address(ary1, len, Address::times_1));
3661   negptr(len);
3662 
3663   bind(COMPARE_VECTORS);
3664   movl(tmp1, Address(ary1, len, Address::times_1));
3665   andl(tmp1, 0x80808080);
3666   jccb(Assembler::notZero, TRUE_LABEL);
3667   addptr(len, 4);
3668   jcc(Assembler::notZero, COMPARE_VECTORS);
3669 
3670   // Compare trailing char (final 2 bytes), if any
3671   bind(COMPARE_CHAR);
3672   testl(result, 0x2);   // tail  char
3673   jccb(Assembler::zero, COMPARE_BYTE);
3674   load_unsigned_short(tmp1, Address(ary1, 0));
3675   andl(tmp1, 0x00008080);
3676   jccb(Assembler::notZero, TRUE_LABEL);
3677   subptr(result, 2);
3678   lea(ary1, Address(ary1, 2));
3679 
3680   bind(COMPARE_BYTE);
3681   testl(result, 0x1);   // tail  byte
3682   jccb(Assembler::zero, FALSE_LABEL);
3683   load_unsigned_byte(tmp1, Address(ary1, 0));
3684   andl(tmp1, 0x00000080);
3685   jccb(Assembler::notEqual, TRUE_LABEL);
3686   jmpb(FALSE_LABEL);
3687 
3688   bind(TRUE_LABEL);
3689   movl(result, 1);   // return true
3690   jmpb(DONE);
3691 
3692   bind(FALSE_LABEL);
3693   xorl(result, result); // return false
3694 
3695   // That's it
3696   bind(DONE);
3697   if (UseAVX >= 2 && UseSSE >= 2) {
3698     // clean upper bits of YMM registers
3699     vpxor(vec1, vec1);
3700     vpxor(vec2, vec2);
3701   }
3702 }
// Compare char[] or byte[] arrays, or substrings, aligned to 4 bytes.
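// result is set to 1 if the two regions are equal and 0 otherwise; when
// is_array_equ the array oops are additionally checked here for identity,
// null and equal length before the element-wise comparison.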
3704 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3705                                       Register limit, Register result, Register chr,
3706                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3707   ShortBranchVerifier sbv(this);
3708   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3709 
3710   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3711   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3712 
3713   if (is_array_equ) {
3714     // Check the input args
3715     cmpoop(ary1, ary2);
3716     jcc(Assembler::equal, TRUE_LABEL);
3717 
3718     // Need additional checks for arrays_equals.
3719     testptr(ary1, ary1);
3720     jcc(Assembler::zero, FALSE_LABEL);
3721     testptr(ary2, ary2);
3722     jcc(Assembler::zero, FALSE_LABEL);
3723 
3724     // Check the lengths
3725     movl(limit, Address(ary1, length_offset));
3726     cmpl(limit, Address(ary2, length_offset));
3727     jcc(Assembler::notEqual, FALSE_LABEL);
3728   }
3729 
3730   // count == 0
3731   testl(limit, limit);
3732   jcc(Assembler::zero, TRUE_LABEL);
3733 
3734   if (is_array_equ) {
3735     // Load array address
3736     lea(ary1, Address(ary1, base_offset));
3737     lea(ary2, Address(ary2, base_offset));
3738   }
3739 
3740   if (is_array_equ && is_char) {
3741     // arrays_equals when used for char[].
3742     shll(limit, 1);      // byte count != 0
3743   }
3744   movl(result, limit); // copy
3745 
3746   if (UseAVX >= 2) {
3747     // With AVX2, use 32-byte vector compare
3748     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3749 
3750     // Compare 32-byte vectors
3751     andl(result, 0x0000001f);  //   tail count (in bytes)
3752     andl(limit, 0xffffffe0);   // vector count (in bytes)
3753     jcc(Assembler::zero, COMPARE_TAIL);
3754 
3755     lea(ary1, Address(ary1, limit, Address::times_1));
3756     lea(ary2, Address(ary2, limit, Address::times_1));
3757     negptr(limit);
3758 
3759 #ifdef _LP64
3760     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3761       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3762 
3763       cmpl(limit, -64);
3764       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3765 
3766       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3767 
3768       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3769       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3770       kortestql(mask, mask);
3771       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3772       addptr(limit, 64);  // update since we already compared at this addr
3773       cmpl(limit, -64);
3774       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3775 
3776       // At this point we may still need to compare -limit+result bytes.
3777       // We could execute the next two instruction and just continue via non-wide path:
3778       //  cmpl(limit, 0);
3779       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3780       // But since we stopped at the points ary{1,2}+limit which are
3781       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3782       // (|limit| <= 32 and result < 32),
3783       // we may just compare the last 64 bytes.
3784       //
3785       addptr(result, -64);   // it is safe, bc we just came from this area
3786       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3787       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3788       kortestql(mask, mask);
3789       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3790 
3791       jmp(TRUE_LABEL);
3792 
3793       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3794 
3795     }//if (VM_Version::supports_avx512vlbw())
3796 #endif //_LP64
3797     bind(COMPARE_WIDE_VECTORS);
3798     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3799     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3800     vpxor(vec1, vec2);
3801 
3802     vptest(vec1, vec1);
3803     jcc(Assembler::notZero, FALSE_LABEL);
3804     addptr(limit, 32);
3805     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3806 
3807     testl(result, result);
3808     jcc(Assembler::zero, TRUE_LABEL);
3809 
3810     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3811     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3812     vpxor(vec1, vec2);
3813 
3814     vptest(vec1, vec1);
3815     jccb(Assembler::notZero, FALSE_LABEL);
3816     jmpb(TRUE_LABEL);
3817 
3818     bind(COMPARE_TAIL); // limit is zero
3819     movl(limit, result);
3820     // Fallthru to tail compare
3821   } else if (UseSSE42Intrinsics) {
3822     // With SSE4.2, use double quad vector compare
3823     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3824 
3825     // Compare 16-byte vectors
3826     andl(result, 0x0000000f);  //   tail count (in bytes)
3827     andl(limit, 0xfffffff0);   // vector count (in bytes)
3828     jcc(Assembler::zero, COMPARE_TAIL);
3829 
3830     lea(ary1, Address(ary1, limit, Address::times_1));
3831     lea(ary2, Address(ary2, limit, Address::times_1));
3832     negptr(limit);
3833 
3834     bind(COMPARE_WIDE_VECTORS);
3835     movdqu(vec1, Address(ary1, limit, Address::times_1));
3836     movdqu(vec2, Address(ary2, limit, Address::times_1));
3837     pxor(vec1, vec2);
3838 
3839     ptest(vec1, vec1);
3840     jcc(Assembler::notZero, FALSE_LABEL);
3841     addptr(limit, 16);
3842     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3843 
3844     testl(result, result);
3845     jcc(Assembler::zero, TRUE_LABEL);
3846 
3847     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3848     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3849     pxor(vec1, vec2);
3850 
3851     ptest(vec1, vec1);
3852     jccb(Assembler::notZero, FALSE_LABEL);
3853     jmpb(TRUE_LABEL);
3854 
3855     bind(COMPARE_TAIL); // limit is zero
3856     movl(limit, result);
3857     // Fallthru to tail compare
3858   }
3859 
3860   // Compare 4-byte vectors
3861   andl(limit, 0xfffffffc); // vector count (in bytes)
3862   jccb(Assembler::zero, COMPARE_CHAR);
3863 
3864   lea(ary1, Address(ary1, limit, Address::times_1));
3865   lea(ary2, Address(ary2, limit, Address::times_1));
3866   negptr(limit);
3867 
3868   bind(COMPARE_VECTORS);
3869   movl(chr, Address(ary1, limit, Address::times_1));
3870   cmpl(chr, Address(ary2, limit, Address::times_1));
3871   jccb(Assembler::notEqual, FALSE_LABEL);
3872   addptr(limit, 4);
3873   jcc(Assembler::notZero, COMPARE_VECTORS);
3874 
3875   // Compare trailing char (final 2 bytes), if any
3876   bind(COMPARE_CHAR);
3877   testl(result, 0x2);   // tail  char
3878   jccb(Assembler::zero, COMPARE_BYTE);
3879   load_unsigned_short(chr, Address(ary1, 0));
3880   load_unsigned_short(limit, Address(ary2, 0));
3881   cmpl(chr, limit);
3882   jccb(Assembler::notEqual, FALSE_LABEL);
3883 
3884   if (is_array_equ && is_char) {
3885     bind(COMPARE_BYTE);
3886   } else {
3887     lea(ary1, Address(ary1, 2));
3888     lea(ary2, Address(ary2, 2));
3889 
3890     bind(COMPARE_BYTE);
3891     testl(result, 0x1);   // tail  byte
3892     jccb(Assembler::zero, TRUE_LABEL);
3893     load_unsigned_byte(chr, Address(ary1, 0));
3894     load_unsigned_byte(limit, Address(ary2, 0));
3895     cmpl(chr, limit);
3896     jccb(Assembler::notEqual, FALSE_LABEL);
3897   }
3898   bind(TRUE_LABEL);
3899   movl(result, 1);   // return true
3900   jmpb(DONE);
3901 
3902   bind(FALSE_LABEL);
3903   xorl(result, result); // return false
3904 
3905   // That's it
3906   bind(DONE);
3907   if (UseAVX >= 2) {
3908     // clean upper bits of YMM registers
3909     vpxor(vec1, vec1);
3910     vpxor(vec2, vec2);
3911   }
3912 }
3913 
3914 #ifdef _LP64
3915 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3916                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3917   assert(VM_Version::supports_avx512vlbw(), "");
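  // Mask lanes are assumed to hold 0 or 1 per element; 0 - lane turns a 1
  // into 0xFF so that evpmovb2m can gather the sign bit of every lane into
  // the k-register before the scalar popcount/bsf/bsr below.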
3918   vpxor(xtmp, xtmp, xtmp, vec_enc);
3919   vpsubb(xtmp, xtmp, mask, vec_enc);
3920   evpmovb2m(ktmp, xtmp, vec_enc);
3921   kmovql(tmp, ktmp);
3922   switch(opc) {
3923     case Op_VectorMaskTrueCount:
3924       popcntq(dst, tmp);
3925       break;
3926     case Op_VectorMaskLastTrue:
3927       mov64(dst, -1);
3928       bsrq(tmp, tmp);
3929       cmov(Assembler::notZero, dst, tmp);
3930       break;
3931     case Op_VectorMaskFirstTrue:
3932       mov64(dst, masklen);
3933       bsfq(tmp, tmp);
3934       cmov(Assembler::notZero, dst, tmp);
3935       break;
3936     default: assert(false, "Unhandled mask operation");
3937   }
3938 }
3939 
3940 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3941                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3942   assert(VM_Version::supports_avx(), "");
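  // Same 0/1 -> 0/0xFF trick as in the AVX-512 variant above, but the lane
  // sign bits are collected with vpmovmskb into a general purpose register
  // instead of a k-register.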
3943   vpxor(xtmp, xtmp, xtmp, vec_enc);
3944   vpsubb(xtmp, xtmp, mask, vec_enc);
3945   vpmovmskb(tmp, xtmp, vec_enc);
3946   if (masklen < 64) {
3947     andq(tmp, (((jlong)1 << masklen) - 1));
3948   }
3949   switch(opc) {
3950     case Op_VectorMaskTrueCount:
3951       popcntq(dst, tmp);
3952       break;
3953     case Op_VectorMaskLastTrue:
3954       mov64(dst, -1);
3955       bsrq(tmp, tmp);
3956       cmov(Assembler::notZero, dst, tmp);
3957       break;
3958     case Op_VectorMaskFirstTrue:
3959       mov64(dst, masklen);
3960       bsfq(tmp, tmp);
3961       cmov(Assembler::notZero, dst, tmp);
3962       break;
3963     default: assert(false, "Unhandled mask operation");
3964   }
3965 }
3966 #endif
3967 
3968 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
3969                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
3970                                         int vlen_enc) {
3971   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that multiples of 16
  // in an index are dropped, i.e. elements corresponding to shuffle indices
  // 16, 32 and 48 all map to in-lane position 0 within their respective
  // 128-bit lanes.
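  // For example, shuffle index 37 selects in-lane position 37 & 0xF = 5 and
  // is satisfied by the pass below that broadcasts the third 128-bit source
  // lane (32 <= 37 < 48).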
3978   movl(rtmp, 16);
3979   evpbroadcastb(xtmp1, rtmp, vlen_enc);
3980 
3981   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
3982   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
3983   // original shuffle indices and move the shuffled lanes corresponding to true
3984   // mask to destination vector.
3985   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
3986   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
3987   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
3988 
3989   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
3990   // and broadcasting second 128 bit lane.
3991   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3992   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
3993   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
3994   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
3995   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3996 
3997   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
3998   // and broadcasting third 128 bit lane.
3999   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
4000   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
4001   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
4002   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
4003   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4004 
4005   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
4007   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
4008   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
4009   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
4010   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
4011   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4012 }