1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/biasedLocking.hpp"
  34 #include "runtime/objectMonitor.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 
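     // Map a vector length in bytes to the AVX vector-length encoding used when
     // emitting VEX/EVEX-encoded instructions; 4- and 8-byte vectors fall back to
     // the 128-bit encoding.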
  37 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  38   switch (vlen_in_bytes) {
  39     case  4: // fall-through
  40     case  8: // fall-through
  41     case 16: return Assembler::AVX_128bit;
  42     case 32: return Assembler::AVX_256bit;
  43     case 64: return Assembler::AVX_512bit;
  44 
  45     default: {
  46       ShouldNotReachHere();
  47       return Assembler::AVX_NoVec;
  48     }
  49   }
  50 }
  51 
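     // Build a vector mask with the low 'src' bits set, i.e. mask = (1 << src) - 1,
     // load it into the k-register 'mask', and leave the element count in 'dst'.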
  52 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  53   guarantee(PostLoopMultiversioning, "must be");
  54   Assembler::movl(dst, 1);
  55   Assembler::shlxl(dst, dst, src);
  56   Assembler::decl(dst);
  57   Assembler::kmovdl(mask, dst);
  58   Assembler::movl(dst, src);
  59 }
  60 
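     // Reset the vector mask to all ones by negating k0, which always reads as zero.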
  61 void C2_MacroAssembler::restorevectmask(KRegister mask) {
  62   guarantee(PostLoopMultiversioning, "must be");
  63   Assembler::knotwl(mask, k0);
  64 }
  65 
  66 #if INCLUDE_RTM_OPT
  67 
  68 // Update rtm_counters based on abort status
  69 // input: abort_status
  70 //        rtm_counters (RTMLockingCounters*)
  71 // flags are killed
  72 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  73 
  74   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  75   if (PrintPreciseRTMLockingStatistics) {
  76     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  77       Label check_abort;
  78       testl(abort_status, (1<<i));
  79       jccb(Assembler::equal, check_abort);
  80       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  81       bind(check_abort);
  82     }
  83   }
  84 }
  85 
  86 // Branch if (random & (count-1) != 0), count is 2^n
  87 // tmp, scr and flags are killed
  88 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  89   assert(tmp == rax, "");
  90   assert(scr == rdx, "");
  91   rdtsc(); // modifies EDX:EAX
  92   andptr(tmp, count-1);
  93   jccb(Assembler::notZero, brLabel);
  94 }
  95 
  96 // Perform abort ratio calculation, set no_rtm bit if high ratio
  97 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  98 // tmpReg, rtm_counters_Reg and flags are killed
  99 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 100                                                     Register rtm_counters_Reg,
 101                                                     RTMLockingCounters* rtm_counters,
 102                                                     Metadata* method_data) {
 103   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 104 
 105   if (RTMLockingCalculationDelay > 0) {
 106     // Delay calculation
 107     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 108     testptr(tmpReg, tmpReg);
 109     jccb(Assembler::equal, L_done);
 110   }
 111   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 112   //   Aborted transactions = abort_count * 100
 113   //   All transactions = total_count *  RTMTotalCountIncrRate
 114   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 115 
 116   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 117   cmpptr(tmpReg, RTMAbortThreshold);
 118   jccb(Assembler::below, L_check_always_rtm2);
 119   imulptr(tmpReg, tmpReg, 100);
 120 
 121   Register scrReg = rtm_counters_Reg;
 122   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 123   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 124   imulptr(scrReg, scrReg, RTMAbortRatio);
 125   cmpptr(tmpReg, scrReg);
 126   jccb(Assembler::below, L_check_always_rtm1);
 127   if (method_data != NULL) {
 128     // set rtm_state to "no rtm" in MDO
 129     mov_metadata(tmpReg, method_data);
 130     lock();
 131     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 132   }
 133   jmpb(L_done);
 134   bind(L_check_always_rtm1);
 135   // Reload RTMLockingCounters* address
 136   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 137   bind(L_check_always_rtm2);
 138   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 139   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 140   jccb(Assembler::below, L_done);
 141   if (method_data != NULL) {
 142     // set rtm_state to "always rtm" in MDO
 143     mov_metadata(tmpReg, method_data);
 144     lock();
 145     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 146   }
 147   bind(L_done);
 148 }
 149 
 150 // Update counters and perform abort ratio calculation
 151 // input:  abort_status_Reg
 152 // rtm_counters_Reg, flags are killed
 153 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 154                                       Register rtm_counters_Reg,
 155                                       RTMLockingCounters* rtm_counters,
 156                                       Metadata* method_data,
 157                                       bool profile_rtm) {
 158 
 159   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 160   // update rtm counters based on rax value at abort
 161   // reads abort_status_Reg, updates flags
 162   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 163   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 164   if (profile_rtm) {
 165     // Save abort status because abort_status_Reg is used by following code.
 166     if (RTMRetryCount > 0) {
 167       push(abort_status_Reg);
 168     }
 169     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 170     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 171     // restore abort status
 172     if (RTMRetryCount > 0) {
 173       pop(abort_status_Reg);
 174     }
 175   }
 176 }
 177 
 178 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 179 // inputs: retry_count_Reg
 180 //       : abort_status_Reg
 181 // output: retry_count_Reg decremented by 1
 182 // flags are killed
 183 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 184   Label doneRetry;
 185   assert(abort_status_Reg == rax, "");
 186   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 187   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 188   // if reason is in 0x6 and retry count != 0 then retry
 189   andptr(abort_status_Reg, 0x6);
 190   jccb(Assembler::zero, doneRetry);
 191   testl(retry_count_Reg, retry_count_Reg);
 192   jccb(Assembler::zero, doneRetry);
 193   pause();
 194   decrementl(retry_count_Reg);
 195   jmp(retryLabel);
 196   bind(doneRetry);
 197 }
 198 
 199 // Spin and retry if lock is busy,
 200 // inputs: box_Reg (monitor address)
 201 //       : retry_count_Reg
 202 // output: retry_count_Reg decremented by 1
 203 //       : clear z flag if retry count exceeded
 204 // tmp_Reg, scr_Reg, flags are killed
 205 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 206                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 207   Label SpinLoop, SpinExit, doneRetry;
 208   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 209 
 210   testl(retry_count_Reg, retry_count_Reg);
 211   jccb(Assembler::zero, doneRetry);
 212   decrementl(retry_count_Reg);
 213   movptr(scr_Reg, RTMSpinLoopCount);
 214 
 215   bind(SpinLoop);
 216   pause();
 217   decrementl(scr_Reg);
 218   jccb(Assembler::lessEqual, SpinExit);
 219   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 220   testptr(tmp_Reg, tmp_Reg);
 221   jccb(Assembler::notZero, SpinLoop);
 222 
 223   bind(SpinExit);
 224   jmp(retryLabel);
 225   bind(doneRetry);
 226   incrementl(retry_count_Reg); // clear z flag
 227 }
 228 
 229 // Use RTM for normal stack locks
 230 // Input: objReg (object to lock)
 231 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 232                                          Register retry_on_abort_count_Reg,
 233                                          RTMLockingCounters* stack_rtm_counters,
 234                                          Metadata* method_data, bool profile_rtm,
 235                                          Label& DONE_LABEL, Label& IsInflated) {
 236   assert(UseRTMForStackLocks, "why call this otherwise?");
 237   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 238   assert(tmpReg == rax, "");
 239   assert(scrReg == rdx, "");
 240   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 241 
 242   if (RTMRetryCount > 0) {
 243     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 244     bind(L_rtm_retry);
 245   }
 246   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 247   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 248   jcc(Assembler::notZero, IsInflated);
 249 
 250   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 251     Label L_noincrement;
 252     if (RTMTotalCountIncrRate > 1) {
 253       // tmpReg, scrReg and flags are killed
 254       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 255     }
 256     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 257     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 258     bind(L_noincrement);
 259   }
 260   xbegin(L_on_abort);
 261   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 262   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 263   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 264   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 265 
 266   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 267   if (UseRTMXendForLockBusy) {
 268     xend();
 269     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 270     jmp(L_decrement_retry);
 271   }
 272   else {
 273     xabort(0);
 274   }
 275   bind(L_on_abort);
 276   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 277     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 278   }
 279   bind(L_decrement_retry);
 280   if (RTMRetryCount > 0) {
 281     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 282     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 283   }
 284 }
 285 
 286 // Use RTM for inflating locks
 287 // inputs: objReg (object to lock)
 288 //         boxReg (on-stack box address (displaced header location) - KILLED)
 289 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 290 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 291                                             Register scrReg, Register retry_on_busy_count_Reg,
 292                                             Register retry_on_abort_count_Reg,
 293                                             RTMLockingCounters* rtm_counters,
 294                                             Metadata* method_data, bool profile_rtm,
 295                                             Label& DONE_LABEL) {
 296   assert(UseRTMLocking, "why call this otherwise?");
 297   assert(tmpReg == rax, "");
 298   assert(scrReg == rdx, "");
 299   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 300   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 301 
 302   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 303   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 304   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 305 
 306   if (RTMRetryCount > 0) {
 307     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 308     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 309     bind(L_rtm_retry);
 310   }
 311   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 312     Label L_noincrement;
 313     if (RTMTotalCountIncrRate > 1) {
 314       // tmpReg, scrReg and flags are killed
 315       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 316     }
 317     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 318     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 319     bind(L_noincrement);
 320   }
 321   xbegin(L_on_abort);
 322   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 323   movptr(tmpReg, Address(tmpReg, owner_offset));
 324   testptr(tmpReg, tmpReg);
 325   jcc(Assembler::zero, DONE_LABEL);
 326   if (UseRTMXendForLockBusy) {
 327     xend();
 328     jmp(L_decrement_retry);
 329   }
 330   else {
 331     xabort(0);
 332   }
 333   bind(L_on_abort);
 334   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 335   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 336     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 337   }
 338   if (RTMRetryCount > 0) {
 339     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 340     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 341   }
 342 
 343   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 344   testptr(tmpReg, tmpReg) ;
 345   jccb(Assembler::notZero, L_decrement_retry) ;
 346 
 347   // Appears unlocked - try to swing _owner from null to non-null.
 348   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 349 #ifdef _LP64
 350   Register threadReg = r15_thread;
 351 #else
 352   get_thread(scrReg);
 353   Register threadReg = scrReg;
 354 #endif
 355   lock();
 356   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 357 
 358   if (RTMRetryCount > 0) {
 359     // If the CAS succeeded we are done, otherwise retry
 360     jccb(Assembler::equal, DONE_LABEL) ;
 361     bind(L_decrement_retry);
 362     // Spin and retry if lock is busy.
 363     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 364   }
 365   else {
 366     bind(L_decrement_retry);
 367   }
 368 }
 369 
 370 #endif //  INCLUDE_RTM_OPT
 371 
 372 // fast_lock and fast_unlock used by C2
 373 
 374 // Because the transitions from emitted code to the runtime
 375 // monitorenter/exit helper stubs are so slow it's critical that
 376 // we inline both the stack-locking fast path and the inflated fast path.
 377 //
 378 // See also: cmpFastLock and cmpFastUnlock.
 379 //
 380 // What follows is a specialized inline transliteration of the code
 381 // in enter() and exit(). If we're concerned about I$ bloat another
 382 // option would be to emit TrySlowEnter and TrySlowExit methods
 383 // at startup-time.  These methods would accept arguments as
 384 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 385 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 386 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 387 // In practice, however, the # of lock sites is bounded and is usually small.
 388 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 389 // if the processor uses simple bimodal branch predictors keyed by EIP,
 390 // since the helper routines would be called from multiple synchronization
 391 // sites.
 392 //
 393 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 394 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 395 // to those specialized methods.  That'd give us a mostly platform-independent
 396 // implementation that the JITs could optimize and inline at their pleasure.
 397 // Done correctly, the only time we'd need to cross to native code would be
 398 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 399 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 400 // (b) provide explicit barriers or fence operations.
 401 //
 402 // TODO:
 403 //
 404 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 405 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 406 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 407 //    the lock operators would typically be faster than reifying Self.
 408 //
 409 // *  Ideally I'd define the primitives as:
 410 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 411 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 412 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 413 //    Instead, we're stuck with rather awkward and brittle register assignments below.
 414 //    Furthermore the register assignments are overconstrained, possibly resulting in
 415 //    sub-optimal code near the synchronization site.
 416 //
 417 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 418 //    Alternately, use a better sp-proximity test.
 419 //
 420 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 421 //    Either one is sufficient to uniquely identify a thread.
 422 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 423 //
 424 // *  Intrinsify notify() and notifyAll() for the common cases where the
 425 //    object is locked by the calling thread but the waitlist is empty,
 426 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 427 //
 428 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 429 //    But beware of excessive branch density on AMD Opterons.
 430 //
 431 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 432 //    or failure of the fast path.  If the fast path fails then we pass
 433 //    control to the slow path, typically in C.  In fast_lock and
 434 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 435 //    will emit a conditional branch immediately after the node.
 436 //    So we have branches to branches and lots of ICC.ZF games.
 437 //    Instead, it might be better to have C2 pass a "FailureLabel"
 438 //    into fast_lock and fast_unlock.  In the case of success, control
 439 //    will drop through the node.  ICC.ZF is undefined at exit.
 440 //    In the case of failure, the node will branch directly to the
 441 //    FailureLabel.
 442 
 443 
 444 // obj: object to lock
 445 // box: on-stack box address (displaced header location) - KILLED
 446 // rax: tmp -- KILLED
 447 // scr: tmp -- KILLED
 448 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 449                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 450                                  BiasedLockingCounters* counters,
 451                                  RTMLockingCounters* rtm_counters,
 452                                  RTMLockingCounters* stack_rtm_counters,
 453                                  Metadata* method_data,
 454                                  bool use_rtm, bool profile_rtm) {
 455   // Ensure the register assignments are disjoint
 456   assert(tmpReg == rax, "");
 457 
 458   if (use_rtm) {
 459     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 460   } else {
 461     assert(cx2Reg == noreg, "");
 462     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 463   }
 464 
 465   if (counters != NULL) {
 466     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 467   }
 468 
 469   // Possible cases that we'll encounter in fast_lock
 470   // ------------------------------------------------
 471   // * Inflated
 472   //    -- unlocked
 473   //    -- Locked
 474   //       = by self
 475   //       = by other
 476   // * biased
 477   //    -- by Self
 478   //    -- by other
 479   // * neutral
 480   // * stack-locked
 481   //    -- by self
 482   //       = sp-proximity test hits
 483   //       = sp-proximity test generates false-negative
 484   //    -- by other
 485   //
 486 
 487   Label IsInflated, DONE_LABEL;
 488 
 489   if (DiagnoseSyncOnValueBasedClasses != 0) {
 490     load_klass(tmpReg, objReg, cx1Reg);
 491     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 492     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 493     jcc(Assembler::notZero, DONE_LABEL);
 494   }
 495 
 496   // it's stack-locked, biased or neutral
 497   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 498   // order to reduce the number of conditional branches in the most common cases.
 499   // Beware -- there's a subtle invariant that fetch of the markword
 500   // at [FETCH], below, will never observe a biased encoding (*101b).
 501   // If this invariant is not held we risk exclusion (safety) failure.
 502   if (UseBiasedLocking && !UseOptoBiasInlining) {
 503     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 504   }
 505 
 506 #if INCLUDE_RTM_OPT
 507   if (UseRTMForStackLocks && use_rtm) {
 508     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 509                       stack_rtm_counters, method_data, profile_rtm,
 510                       DONE_LABEL, IsInflated);
 511   }
 512 #endif // INCLUDE_RTM_OPT
 513 
 514   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 515   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 516   jccb(Assembler::notZero, IsInflated);
 517 
 518   // Attempt stack-locking ...
 519   orptr (tmpReg, markWord::unlocked_value);
 520   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 521   lock();
 522   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 523   if (counters != NULL) {
 524     cond_inc32(Assembler::equal,
 525                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 526   }
 527   jcc(Assembler::equal, DONE_LABEL);           // Success
 528 
 529   // Recursive locking.
 530   // The object is stack-locked: markword contains stack pointer to BasicLock.
 531   // Locked by current thread if difference with current SP is less than one page.
 532   subptr(tmpReg, rsp);
 533   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 534   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 535   movptr(Address(boxReg, 0), tmpReg);
 536   if (counters != NULL) {
 537     cond_inc32(Assembler::equal,
 538                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 539   }
 540   jmp(DONE_LABEL);
 541 
 542   bind(IsInflated);
 543   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 544 
 545 #if INCLUDE_RTM_OPT
 546   // Use the same RTM locking code in 32- and 64-bit VM.
 547   if (use_rtm) {
 548     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 549                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 550   } else {
 551 #endif // INCLUDE_RTM_OPT
 552 
 553 #ifndef _LP64
 554   // The object is inflated.
 555 
 556   // boxReg refers to the on-stack BasicLock in the current frame.
 557   // We'd like to write:
 558   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 559   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 560   // additional latency as we have another ST in the store buffer that must drain.
 561 
 562   // avoid ST-before-CAS
 563   // register juggle because we need tmpReg for cmpxchgptr below
 564   movptr(scrReg, boxReg);
 565   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 566 
 567   // Optimistic form: consider XORL tmpReg,tmpReg
 568   movptr(tmpReg, NULL_WORD);
 569 
 570   // Appears unlocked - try to swing _owner from null to non-null.
 571   // Ideally, I'd manifest "Self" with get_thread and then attempt
 572   // to CAS the register containing Self into m->Owner.
 573   // But we don't have enough registers, so instead we can either try to CAS
 574   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 575   // we later store "Self" into m->Owner.  Transiently storing a stack address
 576   // (rsp or the address of the box) into  m->owner is harmless.
 577   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 578   lock();
 579   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 580   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 581   // If we weren't able to swing _owner from NULL to the BasicLock
 582   // then take the slow path.
 583   jccb  (Assembler::notZero, DONE_LABEL);
 584   // update _owner from BasicLock to thread
 585   get_thread (scrReg);                    // beware: clobbers ICCs
 586   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 587   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 588 
 589   // If the CAS fails we can either retry or pass control to the slow path.
 590   // We use the latter tactic.
 591   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 592   // If the CAS was successful ...
 593   //   Self has acquired the lock
 594   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 595   // Intentional fall-through into DONE_LABEL ...
 596 #else // _LP64
 597   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 598   movq(scrReg, tmpReg);
 599   xorq(tmpReg, tmpReg);
 600   lock();
 601   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 602   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 603   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 604   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 605   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 606   jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)
 607 
 608   cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
 609   jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
 610   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 611   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 612 #endif // _LP64
 613 #if INCLUDE_RTM_OPT
 614   } // use_rtm()
 615 #endif
 616   // DONE_LABEL is a hot target - we'd really like to place it at the
 617   // start of cache line by padding with NOPs.
 618   // See the AMD and Intel software optimization manuals for the
 619   // most efficient "long" NOP encodings.
 620   // Unfortunately none of our alignment mechanisms suffice.
 621   bind(DONE_LABEL);
 622 
 623   // At DONE_LABEL the icc ZFlag is set as follows ...
 624   // fast_unlock uses the same protocol.
 625   // ZFlag == 1 -> Success
 626   // ZFlag == 0 -> Failure - force control through the slow path
 627 }
 628 
 629 // obj: object to unlock
 630 // box: box address (displaced header location), killed.  Must be EAX.
 631 // tmp: killed, cannot be obj nor box.
 632 //
 633 // Some commentary on balanced locking:
 634 //
 635 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 636 // Methods that don't have provably balanced locking are forced to run in the
 637 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 638 // The interpreter provides two properties:
 639 // I1:  At return-time the interpreter automatically and quietly unlocks any
 640 //      objects acquired by the current activation (frame).  Recall that the
 641 //      interpreter maintains an on-stack list of locks currently held by
 642 //      a frame.
 643 // I2:  If a method attempts to unlock an object that is not held by the
 644 //      frame, the interpreter throws IMSX.
 645 //
 646 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 647 // B() doesn't have provably balanced locking so it runs in the interpreter.
 648 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 649 // is still locked by A().
 650 //
 651 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 652 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 653 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 654 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 655 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 656 // could reasonably *avoid* checking owner in fast_unlock().
 657 // In the interest of performance we elide m->Owner==Self check in unlock.
 658 // A perfectly viable alternative is to elide the owner check except when
 659 // Xcheck:jni is enabled.
 660 
 661 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 662   assert(boxReg == rax, "");
 663   assert_different_registers(objReg, boxReg, tmpReg);
 664 
 665   Label DONE_LABEL, Stacked, CheckSucc;
 666 
 667   // Critically, the biased locking test must have precedence over
 668   // and appear before the (box->dhw == 0) recursive stack-lock test.
 669   if (UseBiasedLocking && !UseOptoBiasInlining) {
 670     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 671   }
 672 
 673 #if INCLUDE_RTM_OPT
 674   if (UseRTMForStackLocks && use_rtm) {
 675     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 676     Label L_regular_unlock;
 677     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 678     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 679     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 680     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 681     xend();                                                           // otherwise end...
 682     jmp(DONE_LABEL);                                                  // ... and we're done
 683     bind(L_regular_unlock);
 684   }
 685 #endif
 686 
 687   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 688   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 689   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 690   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 691   jccb  (Assembler::zero, Stacked);
 692 
 693   // It's inflated.
 694 #if INCLUDE_RTM_OPT
 695   if (use_rtm) {
 696     Label L_regular_inflated_unlock;
 697     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 698     movptr(boxReg, Address(tmpReg, owner_offset));
 699     testptr(boxReg, boxReg);
 700     jccb(Assembler::notZero, L_regular_inflated_unlock);
 701     xend();
 702     jmpb(DONE_LABEL);
 703     bind(L_regular_inflated_unlock);
 704   }
 705 #endif
 706 
 707   // Despite our balanced locking property we still check that m->_owner == Self
 708   // as java routines or native JNI code called by this thread might
 709   // have released the lock.
 710   // Refer to the comments in synchronizer.cpp for how we might encode extra
 711   // state in _succ so we can avoid fetching EntryList|cxq.
 712   //
 713   // If there's no contention try a 1-0 exit.  That is, exit without
 714   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 715   // we detect and recover from the race that the 1-0 exit admits.
 716   //
 717   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 718   // before it STs null into _owner, releasing the lock.  Updates
 719   // to data protected by the critical section must be visible before
 720   // we drop the lock (and thus before any other thread could acquire
 721   // the lock and observe the fields protected by the lock).
 722   // IA32's memory-model is SPO, so STs are ordered with respect to
 723   // each other and there's no need for an explicit barrier (fence).
 724   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 725 #ifndef _LP64
 726   get_thread (boxReg);
 727 
 728   // Note that we could employ various encoding schemes to reduce
 729   // the number of loads below (currently 4) to just 2 or 3.
 730   // Refer to the comments in synchronizer.cpp.
 731   // In practice the chain of fetches doesn't seem to impact performance, however.
 732   xorptr(boxReg, boxReg);
 733   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 734   jccb  (Assembler::notZero, DONE_LABEL);
 735   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 736   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 737   jccb  (Assembler::notZero, CheckSucc);
 738   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 739   jmpb  (DONE_LABEL);
 740 
 741   bind (Stacked);
 742   // It's not inflated and it's not recursively stack-locked and it's not biased.
 743   // It must be stack-locked.
 744   // Try to reset the header to displaced header.
 745   // The "box" value on the stack is stable, so we can reload
 746   // and be assured we observe the same value as above.
 747   movptr(tmpReg, Address(boxReg, 0));
 748   lock();
 749   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 750   // Intentional fall-through into DONE_LABEL
 751 
 752   // DONE_LABEL is a hot target - we'd really like to place it at the
 753   // start of cache line by padding with NOPs.
 754   // See the AMD and Intel software optimization manuals for the
 755   // most efficient "long" NOP encodings.
 756   // Unfortunately none of our alignment mechanisms suffice.
 757   bind (CheckSucc);
 758 #else // _LP64
 759   // It's inflated
 760   Label LNotRecursive, LSuccess, LGoSlowPath;
 761 
 762   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 763   jccb(Assembler::equal, LNotRecursive);
 764 
 765   // Recursive inflated unlock
 766   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 767   jmpb(LSuccess);
 768 
 769   bind(LNotRecursive);
 770   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 771   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 772   jccb  (Assembler::notZero, CheckSucc);
 773   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 774   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 775   jmpb  (DONE_LABEL);
 776 
 777   // Try to avoid passing control into the slow_path ...
 778   bind  (CheckSucc);
 779 
 780   // The following optional optimization can be elided if necessary
 781   // Effectively: if (succ == null) goto slow path
 782   // The code reduces the window for a race, however,
 783   // and thus benefits performance.
 784   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 785   jccb  (Assembler::zero, LGoSlowPath);
 786 
 787   xorptr(boxReg, boxReg);
 788   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 789   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 790 
 791   // Memory barrier/fence
 792   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 793   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 794   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 795   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 796   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 797   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 798   lock(); addl(Address(rsp, 0), 0);
 799 
 800   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 801   jccb  (Assembler::notZero, LSuccess);
 802 
 803   // Rare inopportune interleaving - race.
 804   // The successor vanished in the small window above.
 805   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 806   // We need to ensure progress and succession.
 807   // Try to reacquire the lock.
 808   // If that fails then the new owner is responsible for succession and this
 809   // thread needs to take no further action and can exit via the fast path (success).
 810   // If the re-acquire succeeds then pass control into the slow path.
 811   // As implemented, this latter mode is horrible because we generate more
 812   // coherence traffic on the lock *and* artificially extend the critical section
 813   // length by virtue of passing control into the slow path.
 814 
 815   // box is really RAX -- the following CMPXCHG depends on that binding
 816   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 817   lock();
 818   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 819   // There's no successor so we tried to regrab the lock.
 820   // If that didn't work, then another thread grabbed the
 821   // lock so we're done (and exit was a success).
 822   jccb  (Assembler::notEqual, LSuccess);
 823   // Intentional fall-through into slow path
 824 
 825   bind  (LGoSlowPath);
 826   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 827   jmpb  (DONE_LABEL);
 828 
 829   bind  (LSuccess);
 830   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 831   jmpb  (DONE_LABEL);
 832 
 833   bind  (Stacked);
 834   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 835   lock();
 836   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 837 
 838 #endif
 839   bind(DONE_LABEL);
 840 }
 841 
 842 //-------------------------------------------------------------------------------------------
 843 // Generic instructions support for use in .ad files C2 code generation
 844 
 845 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 846   if (dst != src) {
 847     movdqu(dst, src);
 848   }
 849   if (opcode == Op_AbsVD) {
 850     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 851   } else {
 852     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 853     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 854   }
 855 }
 856 
 857 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 858   if (opcode == Op_AbsVD) {
 859     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 860   } else {
 861     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 862     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 863   }
 864 }
 865 
 866 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 867   if (dst != src) {
 868     movdqu(dst, src);
 869   }
 870   if (opcode == Op_AbsVF) {
 871     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 872   } else {
 873     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 874     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 875   }
 876 }
 877 
 878 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 879   if (opcode == Op_AbsVF) {
 880     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 881   } else {
 882     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 883     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 884   }
 885 }
 886 
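     // SSE min/max for integral vector elements. There is no pminsq/pmaxsq before
     // AVX-512, so the T_LONG case is emulated with pcmpgtq plus blendvpd, which
     // implicitly uses xmm0 as the blend mask, hence the tmp == xmm0 requirement.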
 887 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 888   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 889   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 890 
 891   if (opcode == Op_MinV) {
 892     if (elem_bt == T_BYTE) {
 893       pminsb(dst, src);
 894     } else if (elem_bt == T_SHORT) {
 895       pminsw(dst, src);
 896     } else if (elem_bt == T_INT) {
 897       pminsd(dst, src);
 898     } else {
 899       assert(elem_bt == T_LONG, "required");
 900       assert(tmp == xmm0, "required");
 901       assert_different_registers(dst, src, tmp);
 902       movdqu(xmm0, dst);
 903       pcmpgtq(xmm0, src);
 904       blendvpd(dst, src);  // xmm0 as mask
 905     }
 906   } else { // opcode == Op_MaxV
 907     if (elem_bt == T_BYTE) {
 908       pmaxsb(dst, src);
 909     } else if (elem_bt == T_SHORT) {
 910       pmaxsw(dst, src);
 911     } else if (elem_bt == T_INT) {
 912       pmaxsd(dst, src);
 913     } else {
 914       assert(elem_bt == T_LONG, "required");
 915       assert(tmp == xmm0, "required");
 916       assert_different_registers(dst, src, tmp);
 917       movdqu(xmm0, src);
 918       pcmpgtq(xmm0, dst);
 919       blendvpd(dst, src);  // xmm0 as mask
 920     }
 921   }
 922 }
 923 
 924 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 925                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 926                                  int vlen_enc) {
 927   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 928 
 929   if (opcode == Op_MinV) {
 930     if (elem_bt == T_BYTE) {
 931       vpminsb(dst, src1, src2, vlen_enc);
 932     } else if (elem_bt == T_SHORT) {
 933       vpminsw(dst, src1, src2, vlen_enc);
 934     } else if (elem_bt == T_INT) {
 935       vpminsd(dst, src1, src2, vlen_enc);
 936     } else {
 937       assert(elem_bt == T_LONG, "required");
 938       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 939         vpminsq(dst, src1, src2, vlen_enc);
 940       } else {
 941         assert_different_registers(dst, src1, src2);
 942         vpcmpgtq(dst, src1, src2, vlen_enc);
 943         vblendvpd(dst, src1, src2, dst, vlen_enc);
 944       }
 945     }
 946   } else { // opcode == Op_MaxV
 947     if (elem_bt == T_BYTE) {
 948       vpmaxsb(dst, src1, src2, vlen_enc);
 949     } else if (elem_bt == T_SHORT) {
 950       vpmaxsw(dst, src1, src2, vlen_enc);
 951     } else if (elem_bt == T_INT) {
 952       vpmaxsd(dst, src1, src2, vlen_enc);
 953     } else {
 954       assert(elem_bt == T_LONG, "required");
 955       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 956         vpmaxsq(dst, src1, src2, vlen_enc);
 957       } else {
 958         assert_different_registers(dst, src1, src2);
 959         vpcmpgtq(dst, src1, src2, vlen_enc);
 960         vblendvpd(dst, src2, src1, dst, vlen_enc);
 961       }
 962     }
 963   }
 964 }
 965 
 966 // Float/Double min max
 967 
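     // Plain vminps/vmaxps do not match Java's Math.min/max for -0.0/+0.0 ordering
     // and NaN propagation, so the operands are first blended by sign bit and NaN
     // lanes are fixed up afterwards with an unordered compare.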
 968 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 969                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 970                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 971                                    int vlen_enc) {
 972   assert(UseAVX > 0, "required");
 973   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 974          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 975   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 976   assert_different_registers(a, b, tmp, atmp, btmp);
 977 
 978   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 979   bool is_double_word = is_double_word_type(elem_bt);
 980 
 981   if (!is_double_word && is_min) {
 982     vblendvps(atmp, a, b, a, vlen_enc);
 983     vblendvps(btmp, b, a, a, vlen_enc);
 984     vminps(tmp, atmp, btmp, vlen_enc);
 985     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 986     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 987   } else if (!is_double_word && !is_min) {
 988     vblendvps(btmp, b, a, b, vlen_enc);
 989     vblendvps(atmp, a, b, b, vlen_enc);
 990     vmaxps(tmp, atmp, btmp, vlen_enc);
 991     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 992     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 993   } else if (is_double_word && is_min) {
 994     vblendvpd(atmp, a, b, a, vlen_enc);
 995     vblendvpd(btmp, b, a, a, vlen_enc);
 996     vminpd(tmp, atmp, btmp, vlen_enc);
 997     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 998     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 999   } else {
1000     assert(is_double_word && !is_min, "sanity");
1001     vblendvpd(btmp, b, a, b, vlen_enc);
1002     vblendvpd(atmp, a, b, b, vlen_enc);
1003     vmaxpd(tmp, atmp, btmp, vlen_enc);
1004     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1005     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1006   }
1007 }
1008 
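     // AVX-512 variant of the float/double min/max above: the sign-bit selection and
     // NaN fix-up use opmask registers (evpmovd2m/evpmovq2m, evcmpps/evcmppd) and
     // masked blends/moves instead of vblendv*.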
1009 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1010                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1011                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1012                                     int vlen_enc) {
1013   assert(UseAVX > 2, "required");
1014   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1015          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1016   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1017   assert_different_registers(dst, a, b, atmp, btmp);
1018 
1019   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1020   bool is_double_word = is_double_word_type(elem_bt);
1021   bool merge = true;
1022 
1023   if (!is_double_word && is_min) {
1024     evpmovd2m(ktmp, a, vlen_enc);
1025     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1026     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1027     vminps(dst, atmp, btmp, vlen_enc);
1028     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1030   } else if (!is_double_word && !is_min) {
1031     evpmovd2m(ktmp, b, vlen_enc);
1032     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1033     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1034     vmaxps(dst, atmp, btmp, vlen_enc);
1035     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1036     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1037   } else if (is_double_word && is_min) {
1038     evpmovq2m(ktmp, a, vlen_enc);
1039     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1040     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1041     vminpd(dst, atmp, btmp, vlen_enc);
1042     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1043     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1044   } else {
1045     assert(is_double_word && !is_min, "sanity");
1046     evpmovq2m(ktmp, b, vlen_enc);
1047     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1048     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1049     vmaxpd(dst, atmp, btmp, vlen_enc);
1050     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1051     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1052   }
1053 }
1054 
1055 // Float/Double signum
1056 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1057                                   XMMRegister zero, XMMRegister one,
1058                                   Register scratch) {
1059   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1060 
1061   Label DONE_LABEL;
1062 
1063   if (opcode == Op_SignumF) {
1064     assert(UseSSE > 0, "required");
1065     ucomiss(dst, zero);
1066     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1067     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1068     movflt(dst, one);
1069     jcc(Assembler::above, DONE_LABEL);
1070     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1071   } else if (opcode == Op_SignumD) {
1072     assert(UseSSE > 1, "required");
1073     ucomisd(dst, zero);
1074     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1075     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1076     movdbl(dst, one);
1077     jcc(Assembler::above, DONE_LABEL);
1078     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1079   }
1080 
1081   bind(DONE_LABEL);
1082 }
1083 
1084 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1085   if (sign) {
1086     pmovsxbw(dst, src);
1087   } else {
1088     pmovzxbw(dst, src);
1089   }
1090 }
1091 
1092 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1093   if (sign) {
1094     vpmovsxbw(dst, src, vector_len);
1095   } else {
1096     vpmovzxbw(dst, src, vector_len);
1097   }
1098 }
1099 
1100 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1101   if (sign) {
1102     vpmovsxbd(dst, src, vector_len);
1103   } else {
1104     vpmovzxbd(dst, src, vector_len);
1105   }
1106 }
1107 
1108 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1109   if (sign) {
1110     vpmovsxwd(dst, src, vector_len);
1111   } else {
1112     vpmovzxwd(dst, src, vector_len);
1113   }
1114 }
1115 
1116 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1117                                      int shift, int vector_len) {
1118   if (opcode == Op_RotateLeftV) {
1119     if (etype == T_INT) {
1120       evprold(dst, src, shift, vector_len);
1121     } else {
1122       assert(etype == T_LONG, "expected type T_LONG");
1123       evprolq(dst, src, shift, vector_len);
1124     }
1125   } else {
1126     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1127     if (etype == T_INT) {
1128       evprord(dst, src, shift, vector_len);
1129     } else {
1130       assert(etype == T_LONG, "expected type T_LONG");
1131       evprorq(dst, src, shift, vector_len);
1132     }
1133   }
1134 }
1135 
1136 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1137                                      XMMRegister shift, int vector_len) {
1138   if (opcode == Op_RotateLeftV) {
1139     if (etype == T_INT) {
1140       evprolvd(dst, src, shift, vector_len);
1141     } else {
1142       assert(etype == T_LONG, "expected type T_LONG");
1143       evprolvq(dst, src, shift, vector_len);
1144     }
1145   } else {
1146     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1147     if (etype == T_INT) {
1148       evprorvd(dst, src, shift, vector_len);
1149     } else {
1150       assert(etype == T_LONG, "expected type T_LONG");
1151       evprorvq(dst, src, shift, vector_len);
1152     }
1153   }
1154 }
1155 
1156 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1157   if (opcode == Op_RShiftVI) {
1158     psrad(dst, shift);
1159   } else if (opcode == Op_LShiftVI) {
1160     pslld(dst, shift);
1161   } else {
1162     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1163     psrld(dst, shift);
1164   }
1165 }
1166 
1167 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1168   switch (opcode) {
1169     case Op_RShiftVI:  psrad(dst, shift); break;
1170     case Op_LShiftVI:  pslld(dst, shift); break;
1171     case Op_URShiftVI: psrld(dst, shift); break;
1172 
1173     default: assert(false, "%s", NodeClassNames[opcode]);
1174   }
1175 }
1176 
1177 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1178   if (opcode == Op_RShiftVI) {
1179     vpsrad(dst, nds, shift, vector_len);
1180   } else if (opcode == Op_LShiftVI) {
1181     vpslld(dst, nds, shift, vector_len);
1182   } else {
1183     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1184     vpsrld(dst, nds, shift, vector_len);
1185   }
1186 }
1187 
1188 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1189   switch (opcode) {
1190     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1191     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1192     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1193 
1194     default: assert(false, "%s", NodeClassNames[opcode]);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1199   switch (opcode) {
1200     case Op_RShiftVB:  // fall-through
1201     case Op_RShiftVS:  psraw(dst, shift); break;
1202 
1203     case Op_LShiftVB:  // fall-through
1204     case Op_LShiftVS:  psllw(dst, shift);   break;
1205 
1206     case Op_URShiftVS: // fall-through
1207     case Op_URShiftVB: psrlw(dst, shift);  break;
1208 
1209     default: assert(false, "%s", NodeClassNames[opcode]);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1214   switch (opcode) {
1215     case Op_RShiftVB:  // fall-through
1216     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1217 
1218     case Op_LShiftVB:  // fall-through
1219     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1220 
1221     case Op_URShiftVS: // fall-through
1222     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1223 
1224     default: assert(false, "%s", NodeClassNames[opcode]);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1229   switch (opcode) {
1230     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1231     case Op_LShiftVL:  psllq(dst, shift); break;
1232     case Op_URShiftVL: psrlq(dst, shift); break;
1233 
1234     default: assert(false, "%s", NodeClassNames[opcode]);
1235   }
1236 }
1237 
1238 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1239   if (opcode == Op_RShiftVL) {
1240     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1241   } else if (opcode == Op_LShiftVL) {
1242     psllq(dst, shift);
1243   } else {
1244     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1245     psrlq(dst, shift);
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1250   switch (opcode) {
1251     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1252     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1253     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1254 
1255     default: assert(false, "%s", NodeClassNames[opcode]);
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1260   if (opcode == Op_RShiftVL) {
1261     evpsraq(dst, nds, shift, vector_len);
1262   } else if (opcode == Op_LShiftVL) {
1263     vpsllq(dst, nds, shift, vector_len);
1264   } else {
1265     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1266     vpsrlq(dst, nds, shift, vector_len);
1267   }
1268 }
1269 
1270 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1271   switch (opcode) {
1272     case Op_RShiftVB:  // fall-through
1273     case Op_RShiftVS:  // fall-through
1274     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1275 
1276     case Op_LShiftVB:  // fall-through
1277     case Op_LShiftVS:  // fall-through
1278     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1279 
1280     case Op_URShiftVB: // fall-through
1281     case Op_URShiftVS: // fall-through
1282     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1283 
1284     default: assert(false, "%s", NodeClassNames[opcode]);
1285   }
1286 }
1287 
1288 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1289   switch (opcode) {
1290     case Op_RShiftVB:  // fall-through
1291     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1292 
1293     case Op_LShiftVB:  // fall-through
1294     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1295 
1296     case Op_URShiftVB: // fall-through
1297     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1298 
1299     default: assert(false, "%s", NodeClassNames[opcode]);
1300   }
1301 }
1302 
1303 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1304   assert(UseAVX >= 2, "required");
1305   switch (opcode) {
1306     case Op_RShiftVL: {
1307       if (UseAVX > 2) {
1308         assert(tmp == xnoreg, "not used");
1309         if (!VM_Version::supports_avx512vl()) {
1310           vlen_enc = Assembler::AVX_512bit;
1311         }
1312         evpsravq(dst, src, shift, vlen_enc);
1313       } else {
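        // AVX2 has no variable arithmetic right shift for 64-bit lanes, so emulate it.
        // With m = 0x8000000000000000 >>> n per lane (taken from vector_long_sign_mask()):
        //   x >> n  ==  ((x >>> n) ^ m) - m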
1314         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1315         vpsrlvq(dst, src, shift, vlen_enc);
1316         vpsrlvq(tmp, tmp, shift, vlen_enc);
1317         vpxor(dst, dst, tmp, vlen_enc);
1318         vpsubq(dst, dst, tmp, vlen_enc);
1319       }
1320       break;
1321     }
1322     case Op_LShiftVL: {
1323       assert(tmp == xnoreg, "not used");
1324       vpsllvq(dst, src, shift, vlen_enc);
1325       break;
1326     }
1327     case Op_URShiftVL: {
1328       assert(tmp == xnoreg, "not used");
1329       vpsrlvq(dst, src, shift, vlen_enc);
1330       break;
1331     }
1332     default: assert(false, "%s", NodeClassNames[opcode]);
1333   }
1334 }
1335 
1336 // Variable shift of src by shift, using vtmp and scratch as TEMPs, giving a word result in dst
1337 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1338   assert(opcode == Op_LShiftVB ||
1339          opcode == Op_RShiftVB ||
1340          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1341   bool sign = (opcode != Op_URShiftVB);
1342   assert(vector_len == 0, "required");
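  // Widen the bytes to 32-bit lanes, do the variable shift there, then mask the
  // results back to byte range and pack the dwords down to words.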
1343   vextendbd(sign, dst, src, 1);
1344   vpmovzxbd(vtmp, shift, 1);
1345   varshiftd(opcode, dst, dst, vtmp, 1);
1346   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1347   vextracti128_high(vtmp, dst);
1348   vpackusdw(dst, dst, vtmp, 0);
1349 }
1350 
1351 // Variable shift of src by shift, using vtmp and scratch as TEMPs, giving a byte result in dst
1352 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1353   assert(opcode == Op_LShiftVB ||
1354          opcode == Op_RShiftVB ||
1355          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1356   bool sign = (opcode != Op_URShiftVB);
1357   int ext_vector_len = vector_len + 1;
1358   vextendbw(sign, dst, src, ext_vector_len);
1359   vpmovzxbw(vtmp, shift, ext_vector_len);
1360   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1361   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1362   if (vector_len == 0) {
1363     vextracti128_high(vtmp, dst);
1364     vpackuswb(dst, dst, vtmp, vector_len);
1365   } else {
1366     vextracti64x4_high(vtmp, dst);
1367     vpackuswb(dst, dst, vtmp, vector_len);
1368     vpermq(dst, dst, 0xD8, vector_len);
1369   }
1370 }
1371 
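// Insert a general-purpose register value into lane 'idx' of a 128-bit vector.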
1372 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1373   switch(typ) {
1374     case T_BYTE:
1375       pinsrb(dst, val, idx);
1376       break;
1377     case T_SHORT:
1378       pinsrw(dst, val, idx);
1379       break;
1380     case T_INT:
1381       pinsrd(dst, val, idx);
1382       break;
1383     case T_LONG:
1384       pinsrq(dst, val, idx);
1385       break;
1386     default:
1387       assert(false,"Should not reach here.");
1388       break;
1389   }
1390 }
1391 
1392 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1393   switch(typ) {
1394     case T_BYTE:
1395       vpinsrb(dst, src, val, idx);
1396       break;
1397     case T_SHORT:
1398       vpinsrw(dst, src, val, idx);
1399       break;
1400     case T_INT:
1401       vpinsrd(dst, src, val, idx);
1402       break;
1403     case T_LONG:
1404       vpinsrq(dst, src, val, idx);
1405       break;
1406     default:
1407       assert(false,"Should not reach here.");
1408       break;
1409   }
1410 }
1411 
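// Gather loads: dst[i] = *(base + idx[i] * scale) for the lanes selected by the mask.
// Note that the AVX2 forms below consume the XMM mask and clear it as a side effect;
// the AVX-512 forms use a k-register mask instead.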
1412 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1413   switch(typ) {
1414     case T_INT:
1415       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1416       break;
1417     case T_FLOAT:
1418       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1419       break;
1420     case T_LONG:
1421       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1422       break;
1423     case T_DOUBLE:
1424       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1425       break;
1426     default:
1427       assert(false,"Should not reach here.");
1428       break;
1429   }
1430 }
1431 
1432 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1433   switch(typ) {
1434     case T_INT:
1435       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1436       break;
1437     case T_FLOAT:
1438       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1439       break;
1440     case T_LONG:
1441       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1442       break;
1443     case T_DOUBLE:
1444       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1445       break;
1446     default:
1447       assert(false,"Should not reach here.");
1448       break;
1449   }
1450 }
1451 
1452 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1453   switch(typ) {
1454     case T_INT:
1455       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1456       break;
1457     case T_FLOAT:
1458       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1459       break;
1460     case T_LONG:
1461       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1462       break;
1463     case T_DOUBLE:
1464       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1465       break;
1466     default:
1467       assert(false,"Should not reach here.");
1468       break;
1469   }
1470 }
1471 
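// Convert a boolean vector (0 or 1 per byte) into a lane mask: 0 - x yields 0x00 or 0xFF
// per byte, which is then sign-extended to the requested element width.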
1472 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1473   if (vlen_in_bytes <= 16) {
1474     pxor (dst, dst);
1475     psubb(dst, src);
1476     switch (elem_bt) {
1477       case T_BYTE:   /* nothing to do */ break;
1478       case T_SHORT:  pmovsxbw(dst, dst); break;
1479       case T_INT:    pmovsxbd(dst, dst); break;
1480       case T_FLOAT:  pmovsxbd(dst, dst); break;
1481       case T_LONG:   pmovsxbq(dst, dst); break;
1482       case T_DOUBLE: pmovsxbq(dst, dst); break;
1483 
1484       default: assert(false, "%s", type2name(elem_bt));
1485     }
1486   } else {
1487     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1488     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1489 
1490     vpxor (dst, dst, dst, vlen_enc);
1491     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1492 
1493     switch (elem_bt) {
1494       case T_BYTE:   /* nothing to do */            break;
1495       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1496       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1497       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1498       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1499       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1500 
1501       default: assert(false, "%s", type2name(elem_bt));
1502     }
1503   }
1504 }
1505 
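// Load the first vlen_in_bytes entries of the iota table (byte indices 0, 1, 2, ...).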
1506 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1507   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1508   if (vlen_in_bytes == 4) {
1509     movdl(dst, addr);
1510   } else if (vlen_in_bytes == 8) {
1511     movq(dst, addr);
1512   } else if (vlen_in_bytes == 16) {
1513     movdqu(dst, addr, scratch);
1514   } else if (vlen_in_bytes == 32) {
1515     vmovdqu(dst, addr, scratch);
1516   } else {
1517     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1518     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1519   }
1520 }
1521 
1522 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1523 
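// Combine two 128-bit vectors according to the reduction opcode (packed for the integer
// ops, scalar for the float/double add/mul); dst doubles as the accumulator.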
1524 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1525   int vector_len = Assembler::AVX_128bit;
1526 
1527   switch (opcode) {
1528     case Op_AndReductionV:  pand(dst, src); break;
1529     case Op_OrReductionV:   por (dst, src); break;
1530     case Op_XorReductionV:  pxor(dst, src); break;
1531     case Op_MinReductionV:
1532       switch (typ) {
1533         case T_BYTE:        pminsb(dst, src); break;
1534         case T_SHORT:       pminsw(dst, src); break;
1535         case T_INT:         pminsd(dst, src); break;
1536         case T_LONG:        assert(UseAVX > 2, "required");
1537                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1538         default:            assert(false, "wrong type");
1539       }
1540       break;
1541     case Op_MaxReductionV:
1542       switch (typ) {
1543         case T_BYTE:        pmaxsb(dst, src); break;
1544         case T_SHORT:       pmaxsw(dst, src); break;
1545         case T_INT:         pmaxsd(dst, src); break;
1546         case T_LONG:        assert(UseAVX > 2, "required");
1547                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1548         default:            assert(false, "wrong type");
1549       }
1550       break;
1551     case Op_AddReductionVF: addss(dst, src); break;
1552     case Op_AddReductionVD: addsd(dst, src); break;
1553     case Op_AddReductionVI:
1554       switch (typ) {
1555         case T_BYTE:        paddb(dst, src); break;
1556         case T_SHORT:       paddw(dst, src); break;
1557         case T_INT:         paddd(dst, src); break;
1558         default:            assert(false, "wrong type");
1559       }
1560       break;
1561     case Op_AddReductionVL: paddq(dst, src); break;
1562     case Op_MulReductionVF: mulss(dst, src); break;
1563     case Op_MulReductionVD: mulsd(dst, src); break;
1564     case Op_MulReductionVI:
1565       switch (typ) {
1566         case T_SHORT:       pmullw(dst, src); break;
1567         case T_INT:         pmulld(dst, src); break;
1568         default:            assert(false, "wrong type");
1569       }
1570       break;
1571     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1572                             vpmullq(dst, dst, src, vector_len); break;
1573     default:                assert(false, "wrong opcode");
1574   }
1575 }
1576 
1577 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1578   int vector_len = Assembler::AVX_256bit;
1579 
1580   switch (opcode) {
1581     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1582     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1583     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1584     case Op_MinReductionV:
1585       switch (typ) {
1586         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1587         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1588         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1589         case T_LONG:        assert(UseAVX > 2, "required");
1590                             vpminsq(dst, src1, src2, vector_len); break;
1591         default:            assert(false, "wrong type");
1592       }
1593       break;
1594     case Op_MaxReductionV:
1595       switch (typ) {
1596         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1597         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1598         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1599         case T_LONG:        assert(UseAVX > 2, "required");
1600                             vpmaxsq(dst, src1, src2, vector_len); break;
1601         default:            assert(false, "wrong type");
1602       }
1603       break;
1604     case Op_AddReductionVI:
1605       switch (typ) {
1606         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1607         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1608         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1609         default:            assert(false, "wrong type");
1610       }
1611       break;
1612     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1613     case Op_MulReductionVI:
1614       switch (typ) {
1615         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1616         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1617         default:            assert(false, "wrong type");
1618       }
1619       break;
1620     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1621     default:                assert(false, "wrong opcode");
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1626                                   XMMRegister dst, XMMRegister src,
1627                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1628   switch (opcode) {
1629     case Op_AddReductionVF:
1630     case Op_MulReductionVF:
1631       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1632       break;
1633 
1634     case Op_AddReductionVD:
1635     case Op_MulReductionVD:
1636       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1637       break;
1638 
1639     default: assert(false, "wrong opcode");
1640   }
1641 }
1642 
1643 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1644                              Register dst, Register src1, XMMRegister src2,
1645                              XMMRegister vtmp1, XMMRegister vtmp2) {
1646   switch (vlen) {
1647     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1648     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1649     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1650     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1651 
1652     default: assert(false, "wrong vector length");
1653   }
1654 }
1655 
1656 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1657                              Register dst, Register src1, XMMRegister src2,
1658                              XMMRegister vtmp1, XMMRegister vtmp2) {
1659   switch (vlen) {
1660     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1661     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1662     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1663     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1664 
1665     default: assert(false, "wrong vector length");
1666   }
1667 }
1668 
1669 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1670                              Register dst, Register src1, XMMRegister src2,
1671                              XMMRegister vtmp1, XMMRegister vtmp2) {
1672   switch (vlen) {
1673     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1674     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1675     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1676     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1677 
1678     default: assert(false, "wrong vector length");
1679   }
1680 }
1681 
1682 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1683                              Register dst, Register src1, XMMRegister src2,
1684                              XMMRegister vtmp1, XMMRegister vtmp2) {
1685   switch (vlen) {
1686     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1687     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1688     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1689     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1690 
1691     default: assert(false, "wrong vector length");
1692   }
1693 }
1694 
1695 #ifdef _LP64
1696 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1697                              Register dst, Register src1, XMMRegister src2,
1698                              XMMRegister vtmp1, XMMRegister vtmp2) {
1699   switch (vlen) {
1700     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1701     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1702     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1703 
1704     default: assert(false, "wrong vector length");
1705   }
1706 }
1707 #endif // _LP64
1708 
1709 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1710   switch (vlen) {
1711     case 2:
1712       assert(vtmp2 == xnoreg, "");
1713       reduce2F(opcode, dst, src, vtmp1);
1714       break;
1715     case 4:
1716       assert(vtmp2 == xnoreg, "");
1717       reduce4F(opcode, dst, src, vtmp1);
1718       break;
1719     case 8:
1720       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1721       break;
1722     case 16:
1723       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1724       break;
1725     default: assert(false, "wrong vector length");
1726   }
1727 }
1728 
1729 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1730   switch (vlen) {
1731     case 2:
1732       assert(vtmp2 == xnoreg, "");
1733       reduce2D(opcode, dst, src, vtmp1);
1734       break;
1735     case 4:
1736       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1737       break;
1738     case 8:
1739       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1740       break;
1741     default: assert(false, "wrong vector length");
1742   }
1743 }
1744 
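// The integer reductions below repeatedly halve the vector (horizontal add or
// shuffle + combine) and finally merge the surviving element with the scalar input src1.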
1745 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1746   if (opcode == Op_AddReductionVI) {
1747     if (vtmp1 != src2) {
1748       movdqu(vtmp1, src2);
1749     }
1750     phaddd(vtmp1, vtmp1);
1751   } else {
1752     pshufd(vtmp1, src2, 0x1);
1753     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1754   }
1755   movdl(vtmp2, src1);
1756   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1757   movdl(dst, vtmp1);
1758 }
1759 
1760 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1761   if (opcode == Op_AddReductionVI) {
1762     if (vtmp1 != src2) {
1763       movdqu(vtmp1, src2);
1764     }
1765     phaddd(vtmp1, src2);
1766     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1767   } else {
1768     pshufd(vtmp2, src2, 0xE);
1769     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1770     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1771   }
1772 }
1773 
1774 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1775   if (opcode == Op_AddReductionVI) {
1776     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1777     vextracti128_high(vtmp2, vtmp1);
1778     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1779     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1780   } else {
1781     vextracti128_high(vtmp1, src2);
1782     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1783     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1788   vextracti64x4_high(vtmp2, src2);
1789   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1790   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1791 }
1792 
1793 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1794   pshufd(vtmp2, src2, 0x1);
1795   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1796   movdqu(vtmp1, vtmp2);
1797   psrldq(vtmp1, 2);
1798   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1799   movdqu(vtmp2, vtmp1);
1800   psrldq(vtmp2, 1);
1801   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1802   movdl(vtmp2, src1);
1803   pmovsxbd(vtmp1, vtmp1);
1804   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1805   pextrb(dst, vtmp1, 0x0);
1806   movsbl(dst, dst);
1807 }
1808 
1809 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1810   pshufd(vtmp1, src2, 0xE);
1811   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1812   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1813 }
1814 
1815 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1816   vextracti128_high(vtmp2, src2);
1817   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1818   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1819 }
1820 
1821 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1822   vextracti64x4_high(vtmp1, src2);
1823   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1824   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1825 }
1826 
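// There is no packed byte multiply on x86, so byte multiply-reductions first widen to 16-bit lanes.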
1827 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1828   pmovsxbw(vtmp2, src2);
1829   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1830 }
1831 
1832 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1833   if (UseAVX > 1) {
1834     int vector_len = Assembler::AVX_256bit;
1835     vpmovsxbw(vtmp1, src2, vector_len);
1836     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1837   } else {
1838     pmovsxbw(vtmp2, src2);
1839     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1840     pshufd(vtmp2, src2, 0xEE);  // bring the upper 8 bytes down to the low half
1841     pmovsxbw(vtmp2, vtmp2);     // and sign-extend them to words
1842     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1843   }
1844 }
1845 
1846 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1847   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1848     int vector_len = Assembler::AVX_512bit;
1849     vpmovsxbw(vtmp1, src2, vector_len);
1850     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1851   } else {
1852     assert(UseAVX >= 2,"Should not reach here.");
1853     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1854     vextracti128_high(vtmp2, src2);
1855     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1856   }
1857 }
1858 
1859 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1860   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1861   vextracti64x4_high(vtmp2, src2);
1862   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1863 }
1864 
1865 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1866   if (opcode == Op_AddReductionVI) {
1867     if (vtmp1 != src2) {
1868       movdqu(vtmp1, src2);
1869     }
1870     phaddw(vtmp1, vtmp1);
1871     phaddw(vtmp1, vtmp1);
1872   } else {
1873     pshufd(vtmp2, src2, 0x1);
1874     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1875     movdqu(vtmp1, vtmp2);
1876     psrldq(vtmp1, 2);
1877     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1878   }
1879   movdl(vtmp2, src1);
1880   pmovsxwd(vtmp1, vtmp1);
1881   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1882   pextrw(dst, vtmp1, 0x0);
1883   movswl(dst, dst);
1884 }
1885 
1886 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1887   if (opcode == Op_AddReductionVI) {
1888     if (vtmp1 != src2) {
1889       movdqu(vtmp1, src2);
1890     }
1891     phaddw(vtmp1, src2);
1892   } else {
1893     pshufd(vtmp1, src2, 0xE);
1894     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1895   }
1896   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1897 }
1898 
1899 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1900   if (opcode == Op_AddReductionVI) {
1901     int vector_len = Assembler::AVX_256bit;
1902     vphaddw(vtmp2, src2, src2, vector_len);
1903     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1904   } else {
1905     vextracti128_high(vtmp2, src2);
1906     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1907   }
1908   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1909 }
1910 
1911 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1912   int vector_len = Assembler::AVX_256bit;
1913   vextracti64x4_high(vtmp1, src2);
1914   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1915   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1916 }
1917 
1918 #ifdef _LP64
1919 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1920   pshufd(vtmp2, src2, 0xE);
1921   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1922   movdq(vtmp1, src1);
1923   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1924   movdq(dst, vtmp1);
1925 }
1926 
1927 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1928   vextracti128_high(vtmp1, src2);
1929   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1930   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1931 }
1932 
1933 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1934   vextracti64x4_high(vtmp2, src2);
1935   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1936   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1937 }
1938 
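// Build a k-register mask with the low 'len' bits set: temp starts as all ones and BZHI
// clears the bits at positions >= len (e.g. len = 5 gives 0b11111).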
1939 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1940   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1941   mov64(temp, -1L);
1942   bzhiq(temp, temp, len);
1943   kmovql(dst, temp);
1944 }
1945 #endif // _LP64
1946 
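// Float/double reductions combine one lane at a time with scalar addss/mulss (addsd/mulsd),
// so the lanes are accumulated in a fixed order.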
1947 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1948   reduce_operation_128(T_FLOAT, opcode, dst, src);
1949   pshufd(vtmp, src, 0x1);
1950   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1951 }
1952 
1953 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1954   reduce2F(opcode, dst, src, vtmp);
1955   pshufd(vtmp, src, 0x2);
1956   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1957   pshufd(vtmp, src, 0x3);
1958   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1959 }
1960 
1961 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1962   reduce4F(opcode, dst, src, vtmp2);
1963   vextractf128_high(vtmp2, src);
1964   reduce4F(opcode, dst, vtmp2, vtmp1);
1965 }
1966 
1967 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1968   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1969   vextracti64x4_high(vtmp1, src);
1970   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1971 }
1972 
1973 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1974   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1975   pshufd(vtmp, src, 0xE);
1976   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1977 }
1978 
1979 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980   reduce2D(opcode, dst, src, vtmp2);
1981   vextractf128_high(vtmp2, src);
1982   reduce2D(opcode, dst, vtmp2, vtmp1);
1983 }
1984 
1985 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1986   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1987   vextracti64x4_high(vtmp1, src);
1988   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1989 }
1990 
1991 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1992   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1993 }
1994 
1995 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1996   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1997 }
1998 
1999 
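// Min/max reduction for floats: log2(vlen) folding steps. Each step brings the upper half
// down (extract or permute) and combines it with vminmax_fp, which takes care of the NaN
// and signed-zero cases of Math.min/max.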
2000 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2001                                           XMMRegister dst, XMMRegister src,
2002                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2003                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2004   int permconst[] = {1, 14};
2005   XMMRegister wsrc = src;
2006   XMMRegister wdst = xmm_0;
2007   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2008 
2009   int vlen_enc = Assembler::AVX_128bit;
2010   if (vlen == 16) {
2011     vlen_enc = Assembler::AVX_256bit;
2012   }
2013 
2014   for (int i = log2(vlen) - 1; i >=0; i--) {
2015     if (i == 0 && !is_dst_valid) {
2016       wdst = dst;
2017     }
2018     if (i == 3) {
2019       vextracti64x4_high(wtmp, wsrc);
2020     } else if (i == 2) {
2021       vextracti128_high(wtmp, wsrc);
2022     } else { // i = [0,1]
2023       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2024     }
2025     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2026     wsrc = wdst;
2027     vlen_enc = Assembler::AVX_128bit;
2028   }
2029   if (is_dst_valid) {
2030     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2031   }
2032 }
2033 
2034 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2035                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2036                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2037   XMMRegister wsrc = src;
2038   XMMRegister wdst = xmm_0;
2039   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2040   int vlen_enc = Assembler::AVX_128bit;
2041   if (vlen == 8) {
2042     vlen_enc = Assembler::AVX_256bit;
2043   }
2044   for (int i = log2(vlen) - 1; i >=0; i--) {
2045     if (i == 0 && !is_dst_valid) {
2046       wdst = dst;
2047     }
2048     if (i == 1) {
2049       vextracti128_high(wtmp, wsrc);
2050     } else if (i == 2) {
2051       vextracti64x4_high(wtmp, wsrc);
2052     } else {
2053       assert(i == 0, "%d", i);
2054       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2055     }
2056     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2057     wsrc = wdst;
2058     vlen_enc = Assembler::AVX_128bit;
2059   }
2060   if (is_dst_valid) {
2061     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2062   }
2063 }
2064 
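// Move lane 'idx' of a 128-bit vector into a general-purpose register.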
2065 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2066   switch (bt) {
2067     case T_BYTE:  pextrb(dst, src, idx); break;
2068     case T_SHORT: pextrw(dst, src, idx); break;
2069     case T_INT:   pextrd(dst, src, idx); break;
2070     case T_LONG:  pextrq(dst, src, idx); break;
2071 
2072     default:
2073       assert(false,"Should not reach here.");
2074       break;
2075   }
2076 }
2077 
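// Return the register holding the 128-bit lane that contains 'elemindex': lane 0 is src
// itself, higher lanes are first extracted into dst.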
2078 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2079   int esize =  type2aelembytes(typ);
2080   int elem_per_lane = 16/esize;
2081   int lane = elemindex / elem_per_lane;
2082   int eindex = elemindex % elem_per_lane;
2083 
2084   if (lane >= 2) {
2085     assert(UseAVX > 2, "required");
2086     vextractf32x4(dst, src, lane & 3);
2087     return dst;
2088   } else if (lane > 0) {
2089     assert(UseAVX > 0, "required");
2090     vextractf128(dst, src, lane);
2091     return dst;
2092   } else {
2093     return src;
2094   }
2095 }
2096 
2097 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2098   int esize =  type2aelembytes(typ);
2099   int elem_per_lane = 16/esize;
2100   int eindex = elemindex % elem_per_lane;
2101   assert(is_integral_type(typ),"required");
2102 
2103   if (eindex == 0) {
2104     if (typ == T_LONG) {
2105       movq(dst, src);
2106     } else {
2107       movdl(dst, src);
2108       if (typ == T_BYTE)
2109         movsbl(dst, dst);
2110       else if (typ == T_SHORT)
2111         movswl(dst, dst);
2112     }
2113   } else {
2114     extract(typ, dst, src, eindex);
2115   }
2116 }
2117 
2118 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2119   int esize =  type2aelembytes(typ);
2120   int elem_per_lane = 16/esize;
2121   int eindex = elemindex % elem_per_lane;
2122   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2123 
2124   if (eindex == 0) {
2125     movq(dst, src);
2126   } else {
2127     if (typ == T_FLOAT) {
2128       if (UseAVX == 0) {
2129         movdqu(dst, src);
2130         pshufps(dst, dst, eindex);
2131       } else {
2132         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2133       }
2134     } else {
2135       if (UseAVX == 0) {
2136         movdqu(dst, src);
2137         psrldq(dst, eindex*esize);
2138       } else {
2139         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2140       }
2141       movq(dst, dst);
2142     }
2143   }
2144   // Zero upper bits
2145   if (typ == T_FLOAT) {
2146     if (UseAVX == 0) {
2147       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2148       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2149       pand(dst, vtmp);
2150     } else {
2151       assert((tmp != noreg), "required.");
2152       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2153     }
2154   }
2155 }
2156 
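// AVX-512 compare into a mask register: 'comparison' selects the predicate (eq, lt, le, ...),
// ksmask is the write mask limiting which lanes participate, and integral types use
// signed compares.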
2157 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2158   switch(typ) {
2159     case T_BYTE:
2160     case T_BOOLEAN:
2161       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2162       break;
2163     case T_SHORT:
2164     case T_CHAR:
2165       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2166       break;
2167     case T_INT:
2168     case T_FLOAT:
2169       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2170       break;
2171     case T_LONG:
2172     case T_DOUBLE:
2173       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2174       break;
2175     default:
2176       assert(false,"Should not reach here.");
2177       break;
2178   }
2179 }
2180 
2181 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2182   switch(typ) {
2183     case T_BOOLEAN:
2184     case T_BYTE:
2185       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2186       break;
2187     case T_CHAR:
2188     case T_SHORT:
2189       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2190       break;
2191     case T_INT:
2192     case T_FLOAT:
2193       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2194       break;
2195     case T_LONG:
2196     case T_DOUBLE:
2197       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2198       break;
2199     default:
2200       assert(false,"Should not reach here.");
2201       break;
2202   }
2203 }
2204 
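// Unsigned compare without AVX-512: zero-extend both operands to the next wider element
// size, perform a signed compare there (safe once the values are non-negative), and pack
// the results back down to the original element width.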
2205 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2206                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2207   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2208   switch (typ) {
2209   case T_BYTE:
2210     vpmovzxbw(vtmp1, src1, vlen_enc);
2211     vpmovzxbw(vtmp2, src2, vlen_enc);
2212     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2213     vpacksswb(dst, dst, dst, vlen_enc);
2214     break;
2215   case T_SHORT:
2216     vpmovzxwd(vtmp1, src1, vlen_enc);
2217     vpmovzxwd(vtmp2, src2, vlen_enc);
2218     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2219     vpackssdw(dst, dst, dst, vlen_enc);
2220     break;
2221   case T_INT:
2222     vpmovzxdq(vtmp1, src1, vlen_enc);
2223     vpmovzxdq(vtmp2, src2, vlen_enc);
2224     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2225     vpermilps(dst, dst, 8, vlen_enc);
2226     break;
2227   default:
2228     assert(false, "Should not reach here");
2229   }
2230   if (vlen_in_bytes == 16) {
2231     vpermpd(dst, dst, 0x8, vlen_enc);
2232   }
2233 }
2234 
2235 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2236                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2237   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2238   switch (typ) {
2239   case T_BYTE:
2240     vpmovzxbw(vtmp1, src1, vlen_enc);
2241     vpmovzxbw(vtmp2, src2, vlen_enc);
2242     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2243     vextracti128(vtmp1, src1, 1);
2244     vextracti128(vtmp2, src2, 1);
2245     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2246     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2247     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2248     vpacksswb(dst, dst, vtmp3, vlen_enc);
2249     vpermpd(dst, dst, 0xd8, vlen_enc);
2250     break;
2251   case T_SHORT:
2252     vpmovzxwd(vtmp1, src1, vlen_enc);
2253     vpmovzxwd(vtmp2, src2, vlen_enc);
2254     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2255     vextracti128(vtmp1, src1, 1);
2256     vextracti128(vtmp2, src2, 1);
2257     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2258     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2259     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2260     vpackssdw(dst, dst, vtmp3, vlen_enc);
2261     vpermpd(dst, dst, 0xd8, vlen_enc);
2262     break;
2263   case T_INT:
2264     vpmovzxdq(vtmp1, src1, vlen_enc);
2265     vpmovzxdq(vtmp2, src2, vlen_enc);
2266     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2267     vpshufd(dst, dst, 8, vlen_enc);
2268     vpermq(dst, dst, 8, vlen_enc);
2269     vextracti128(vtmp1, src1, 1);
2270     vextracti128(vtmp2, src2, 1);
2271     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2272     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2273     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2274     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2275     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2276     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2277     break;
2278   default:
2279     assert(false, "Should not reach here");
2280   }
2281 }
2282 
2283 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2284   switch(typ) {
2285     case T_BYTE:
2286       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2287       break;
2288     case T_SHORT:
2289       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2290       break;
2291     case T_INT:
2292     case T_FLOAT:
2293       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2294       break;
2295     case T_LONG:
2296     case T_DOUBLE:
2297       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2298       break;
2299     default:
2300       assert(false,"Should not reach here.");
2301       break;
2302   }
2303 }
2304 
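// Test one vector against another: 4- and 8-byte vectors are first widened by replicating
// their valid low bits so that ptest never sees garbage lanes; 512-bit vectors go through
// a k-mask compare followed by ktest (ne) or kortest (overflow).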
2305 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2306                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2307   switch(vlen) {
2308     case 4:
2309       assert(vtmp1 != xnoreg, "required.");
2310       // Broadcast lower 32 bits to 128 bits before ptest
2311       pshufd(vtmp1, src1, 0x0);
2312       if (bt == BoolTest::overflow) {
2313         assert(vtmp2 != xnoreg, "required.");
2314         pshufd(vtmp2, src2, 0x0);
2315       } else {
2316         assert(vtmp2 == xnoreg, "required.");
2317         vtmp2 = src2;
2318       }
2319       ptest(vtmp1, vtmp2);
2320      break;
2321     case 8:
2322       assert(vtmp1 != xnoreg, "required.");
2323       // Broadcast lower 64 bits to 128 bits before ptest
2324       pshufd(vtmp1, src1, 0x4);
2325       if (bt == BoolTest::overflow) {
2326         assert(vtmp2 != xnoreg, "required.");
2327         pshufd(vtmp2, src2, 0x4);
2328       } else {
2329         assert(vtmp2 == xnoreg, "required.");
2330         vtmp2 = src2;
2331       }
2332       ptest(vtmp1, vtmp2);
2333      break;
2334     case 16:
2335       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2336       ptest(src1, src2);
2337       break;
2338     case 32:
2339       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2340       vptest(src1, src2, Assembler::AVX_256bit);
2341       break;
2342     case 64:
2343       {
2344         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2345         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2346         if (bt == BoolTest::ne) {
2347           ktestql(mask, mask);
2348         } else {
2349           assert(bt == BoolTest::overflow, "required");
2350           kortestql(mask, mask);
2351         }
2352       }
2353       break;
2354     default:
2355       assert(false,"Should not reach here.");
2356       break;
2357   }
2358 }
2359 
2360 //-------------------------------------------------------------------------------------------
2361 
2362 // IndexOf for constant substrings with size >= 8 chars
2363 // which do not need to be loaded through the stack.
2364 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2365                                          Register cnt1, Register cnt2,
2366                                          int int_cnt2,  Register result,
2367                                          XMMRegister vec, Register tmp,
2368                                          int ae) {
2369   ShortBranchVerifier sbv(this);
2370   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2371   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2372 
2373   // This method uses the pcmpestri instruction with bound registers
2374   //   inputs:
2375   //     xmm - substring
2376   //     rax - substring length (elements count)
2377   //     mem - scanned string
2378   //     rdx - string length (elements count)
2379   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2380   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2381   //   outputs:
2382   //     rcx - matched index in string
2383   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2384   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2385   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2386   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2387   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2388 
2389   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2390         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2391         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2392 
2393   // Note, inline_string_indexOf() generates checks:
2394   // if (substr.count > string.count) return -1;
2395   // if (substr.count == 0) return 0;
2396   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2397 
2398   // Load substring.
2399   if (ae == StrIntrinsicNode::UL) {
2400     pmovzxbw(vec, Address(str2, 0));
2401   } else {
2402     movdqu(vec, Address(str2, 0));
2403   }
2404   movl(cnt2, int_cnt2);
2405   movptr(result, str1); // string addr
2406 
2407   if (int_cnt2 > stride) {
2408     jmpb(SCAN_TO_SUBSTR);
2409 
2410     // Reload substr for rescan; this code
2411     // is executed only for large substrings (> 8 chars)
2412     bind(RELOAD_SUBSTR);
2413     if (ae == StrIntrinsicNode::UL) {
2414       pmovzxbw(vec, Address(str2, 0));
2415     } else {
2416       movdqu(vec, Address(str2, 0));
2417     }
2418     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2419 
2420     bind(RELOAD_STR);
2421     // We came here after the beginning of the substring was
2422     // matched but the rest of it was not, so we need to search
2423     // again. Start from the next element after the previous match.
2424 
2425     // cnt2 is the number of remaining substring elements and
2426     // cnt1 is the number of remaining string elements when the compare failed.
2427     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2428     subl(cnt1, cnt2);
2429     addl(cnt1, int_cnt2);
2430     movl(cnt2, int_cnt2); // Now restore cnt2
2431 
2432     decrementl(cnt1);     // Shift to next element
2433     cmpl(cnt1, cnt2);
2434     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than in the substring
2435 
2436     addptr(result, (1<<scale1));
2437 
2438   } // (int_cnt2 > 8)
2439 
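  // The scan loop advances 16 bytes at a time. In equal-ordered mode pcmpestri sets CF
  // when a match candidate is found within the 16-byte window (its index is left in rcx)
  // and OF when the match starts at offset 0.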
2440   // Scan string for start of substr in 16-byte vectors
2441   bind(SCAN_TO_SUBSTR);
2442   pcmpestri(vec, Address(result, 0), mode);
2443   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2444   subl(cnt1, stride);
2445   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2446   cmpl(cnt1, cnt2);
2447   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than in the substring
2448   addptr(result, 16);
2449   jmpb(SCAN_TO_SUBSTR);
2450 
2451   // Found a potential substr
2452   bind(FOUND_CANDIDATE);
2453   // Matched whole vector if first element matched (tmp(rcx) == 0).
2454   if (int_cnt2 == stride) {
2455     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2456   } else { // int_cnt2 > 8
2457     jccb(Assembler::overflow, FOUND_SUBSTR);
2458   }
2459   // After pcmpestri tmp(rcx) contains matched element index
2460   // Compute start addr of substr
2461   lea(result, Address(result, tmp, scale1));
2462 
2463   // Make sure string is still long enough
2464   subl(cnt1, tmp);
2465   cmpl(cnt1, cnt2);
2466   if (int_cnt2 == stride) {
2467     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2468   } else { // int_cnt2 > 8
2469     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2470   }
2471   // Fewer elements left than in the substring.
2472 
2473   bind(RET_NOT_FOUND);
2474   movl(result, -1);
2475   jmp(EXIT);
2476 
2477   if (int_cnt2 > stride) {
2478     // This code is optimized for the case when the whole substring
2479     // is matched if its head is matched.
2480     bind(MATCH_SUBSTR_HEAD);
2481     pcmpestri(vec, Address(result, 0), mode);
2482     // Reload only the string if it does not match
2483     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2484 
2485     Label CONT_SCAN_SUBSTR;
2486     // Compare the rest of the substring (> 8 chars).
2487     bind(FOUND_SUBSTR);
2488     // First 8 chars are already matched.
2489     negptr(cnt2);
2490     addptr(cnt2, stride);
2491 
2492     bind(SCAN_SUBSTR);
2493     subl(cnt1, stride);
2494     cmpl(cnt2, -stride); // Do not read beyond substring
2495     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2496     // Back up the strings to avoid reading beyond the substring:
2497     // cnt1 = cnt1 - cnt2 + 8
2498     addl(cnt1, cnt2); // cnt2 is negative
2499     addl(cnt1, stride);
2500     movl(cnt2, stride); negptr(cnt2);
2501     bind(CONT_SCAN_SUBSTR);
2502     if (int_cnt2 < (int)G) {
2503       int tail_off1 = int_cnt2<<scale1;
2504       int tail_off2 = int_cnt2<<scale2;
2505       if (ae == StrIntrinsicNode::UL) {
2506         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2507       } else {
2508         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2509       }
2510       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2511     } else {
2512       // calculate index in register to avoid integer overflow (int_cnt2*2)
2513       movl(tmp, int_cnt2);
2514       addptr(tmp, cnt2);
2515       if (ae == StrIntrinsicNode::UL) {
2516         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2517       } else {
2518         movdqu(vec, Address(str2, tmp, scale2, 0));
2519       }
2520       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2521     }
2522     // Need to reload string pointers if the whole vector did not match
2523     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2524     addptr(cnt2, stride);
2525     jcc(Assembler::negative, SCAN_SUBSTR);
2526     // Fall through if found full substring
2527 
2528   } // (int_cnt2 > 8)
2529 
2530   bind(RET_FOUND);
2531   // Found result if we matched full small substring.
2532   // Compute substr offset
2533   subptr(result, str1);
2534   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2535     shrl(result, 1); // index
2536   }
2537   bind(EXIT);
2538 
2539 } // string_indexofC8
2540 
2541 // Small strings are loaded through the stack if they cross a page boundary.
2542 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2543                                        Register cnt1, Register cnt2,
2544                                        int int_cnt2,  Register result,
2545                                        XMMRegister vec, Register tmp,
2546                                        int ae) {
2547   ShortBranchVerifier sbv(this);
2548   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2549   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2550 
2551   //
2552   // int_cnt2 is the length of a small (< 8 chars) constant substring,
2553   // or (-1) for a non-constant substring, in which case its length
2554   // is in the cnt2 register.
2555   //
2556   // Note, inline_string_indexOf() generates checks:
2557   // if (substr.count > string.count) return -1;
2558   // if (substr.count == 0) return 0;
2559   //
2560   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2561   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2562   // This method uses the pcmpestri instruction with bound registers
2563   //   inputs:
2564   //     xmm - substring
2565   //     rax - substring length (elements count)
2566   //     mem - scanned string
2567   //     rdx - string length (elements count)
2568   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2569   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2570   //   outputs:
2571   //     rcx - matched index in string
2572   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2573   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2574   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2575   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2576 
2577   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2578         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2579         FOUND_CANDIDATE;
2580 
2581   { //========================================================
2582     // We don't know where these strings are located
2583     // and we can't read beyond them. Load them through the stack.
2584     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2585 
2586     movptr(tmp, rsp); // save old SP
2587 
2588     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2589       if (int_cnt2 == (1>>scale2)) { // One byte
2590         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2591         load_unsigned_byte(result, Address(str2, 0));
2592         movdl(vec, result); // move 32 bits
2593       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2594         // Not enough header space in 32-bit VM: 12+3 = 15.
2595         movl(result, Address(str2, -1));
2596         shrl(result, 8);
2597         movdl(vec, result); // move 32 bits
2598       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2599         load_unsigned_short(result, Address(str2, 0));
2600         movdl(vec, result); // move 32 bits
2601       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2602         movdl(vec, Address(str2, 0)); // move 32 bits
2603       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2604         movq(vec, Address(str2, 0));  // move 64 bits
2605       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2606         // Array header size is 12 bytes in 32-bit VM
2607         // + 6 bytes for 3 chars == 18 bytes,
2608         // enough space to load vec and shift.
2609         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2610         if (ae == StrIntrinsicNode::UL) {
2611           int tail_off = int_cnt2-8;
2612           pmovzxbw(vec, Address(str2, tail_off));
2613           psrldq(vec, -2*tail_off);
2614         }
2615         else {
2616           int tail_off = int_cnt2*(1<<scale2);
2617           movdqu(vec, Address(str2, tail_off-16));
2618           psrldq(vec, 16-tail_off);
2619         }
2620       }
2621     } else { // not constant substring
2622       cmpl(cnt2, stride);
2623       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2624 
2625       // We can read beyond the string if str+16 does not cross a page boundary
2626       // since heaps are aligned and mapped by pages.
2627       assert(os::vm_page_size() < (int)G, "default page should be small");
2628       movl(result, str2); // We need only low 32 bits
2629       andl(result, (os::vm_page_size()-1));
2630       cmpl(result, (os::vm_page_size()-16));
2631       jccb(Assembler::belowEqual, CHECK_STR);
2632 
2633       // Move small strings to the stack so that 16 bytes can be loaded into vec.
2634       subptr(rsp, 16);
2635       int stk_offset = wordSize-(1<<scale2);
2636       push(cnt2);
2637 
2638       bind(COPY_SUBSTR);
2639       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2640         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2641         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2642       } else if (ae == StrIntrinsicNode::UU) {
2643         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2644         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2645       }
2646       decrement(cnt2);
2647       jccb(Assembler::notZero, COPY_SUBSTR);
2648 
2649       pop(cnt2);
2650       movptr(str2, rsp);  // New substring address
2651     } // non constant
2652 
2653     bind(CHECK_STR);
2654     cmpl(cnt1, stride);
2655     jccb(Assembler::aboveEqual, BIG_STRINGS);
2656 
2657     // Check cross page boundary.
2658     movl(result, str1); // We need only low 32 bits
2659     andl(result, (os::vm_page_size()-1));
2660     cmpl(result, (os::vm_page_size()-16));
2661     jccb(Assembler::belowEqual, BIG_STRINGS);
2662 
2663     subptr(rsp, 16);
2664     int stk_offset = -(1<<scale1);
2665     if (int_cnt2 < 0) { // not constant
2666       push(cnt2);
2667       stk_offset += wordSize;
2668     }
2669     movl(cnt2, cnt1);
2670 
2671     bind(COPY_STR);
2672     if (ae == StrIntrinsicNode::LL) {
2673       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2674       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2675     } else {
2676       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2677       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2678     }
2679     decrement(cnt2);
2680     jccb(Assembler::notZero, COPY_STR);
2681 
2682     if (int_cnt2 < 0) { // not constant
2683       pop(cnt2);
2684     }
2685     movptr(str1, rsp);  // New string address
2686 
2687     bind(BIG_STRINGS);
2688     // Load substring.
2689     if (int_cnt2 < 0) { // -1
2690       if (ae == StrIntrinsicNode::UL) {
2691         pmovzxbw(vec, Address(str2, 0));
2692       } else {
2693         movdqu(vec, Address(str2, 0));
2694       }
2695       push(cnt2);       // substr count
2696       push(str2);       // substr addr
2697       push(str1);       // string addr
2698     } else {
2699       // Small (< 8 chars) constant substrings are loaded already.
2700       movl(cnt2, int_cnt2);
2701     }
2702     push(tmp);  // original SP
2703 
2704   } // Finished loading
2705 
2706   //========================================================
2707   // Start search
2708   //
2709 
2710   movptr(result, str1); // string addr
2711 
2712   if (int_cnt2  < 0) {  // Only for non constant substring
2713     jmpb(SCAN_TO_SUBSTR);
2714 
2715     // SP saved at sp+0
2716     // String saved at sp+1*wordSize
2717     // Substr saved at sp+2*wordSize
2718     // Substr count saved at sp+3*wordSize
2719 
2720     // Reload substr for rescan, this code
2721     // is executed only for large substrings (> 8 chars)
2722     bind(RELOAD_SUBSTR);
2723     movptr(str2, Address(rsp, 2*wordSize));
2724     movl(cnt2, Address(rsp, 3*wordSize));
2725     if (ae == StrIntrinsicNode::UL) {
2726       pmovzxbw(vec, Address(str2, 0));
2727     } else {
2728       movdqu(vec, Address(str2, 0));
2729     }
2730     // We came here after the beginning of the substring was
2731     // matched but the rest of it was not so we need to search
2732     // again. Start from the next element after the previous match.
2733     subptr(str1, result); // Restore counter
2734     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2735       shrl(str1, 1);
2736     }
2737     addl(cnt1, str1);
2738     decrementl(cnt1);   // Shift to next element
2739     cmpl(cnt1, cnt2);
2740     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2741 
2742     addptr(result, (1<<scale1));
2743   } // non constant
2744 
2745   // Scan string for start of substr in 16-byte vectors
2746   bind(SCAN_TO_SUBSTR);
2747   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2748   pcmpestri(vec, Address(result, 0), mode);
2749   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2750   subl(cnt1, stride);
2751   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2752   cmpl(cnt1, cnt2);
2753   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2754   addptr(result, 16);
2755 
2756   bind(ADJUST_STR);
2757   cmpl(cnt1, stride); // Do not read beyond string
2758   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2759   // Back-up string to avoid reading beyond string.
2760   lea(result, Address(result, cnt1, scale1, -16));
2761   movl(cnt1, stride);
2762   jmpb(SCAN_TO_SUBSTR);
2763 
2764   // Found a potential substr
2765   bind(FOUND_CANDIDATE);
2766   // After pcmpestri tmp(rcx) contains matched element index
2767 
2768   // Make sure string is still long enough
2769   subl(cnt1, tmp);
2770   cmpl(cnt1, cnt2);
2771   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2772   // Left less than substring.
2773 
2774   bind(RET_NOT_FOUND);
2775   movl(result, -1);
2776   jmp(CLEANUP);
2777 
2778   bind(FOUND_SUBSTR);
2779   // Compute start addr of substr
2780   lea(result, Address(result, tmp, scale1));
2781   if (int_cnt2 > 0) { // Constant substring
2782     // Repeat search for small substring (< 8 chars)
2783     // from new point without reloading substring.
2784     // Have to check that we don't read beyond string.
2785     cmpl(tmp, stride-int_cnt2);
2786     jccb(Assembler::greater, ADJUST_STR);
2787     // Fall through if matched whole substring.
2788   } else { // non constant
2789     assert(int_cnt2 == -1, "should be != 0");
2790 
2791     addl(tmp, cnt2);
2792     // Found result if we matched whole substring.
2793     cmpl(tmp, stride);
2794     jcc(Assembler::lessEqual, RET_FOUND);
2795 
2796     // Repeat search for small substring (<= 8 chars)
2797     // from new point 'str1' without reloading substring.
2798     cmpl(cnt2, stride);
2799     // Have to check that we don't read beyond string.
2800     jccb(Assembler::lessEqual, ADJUST_STR);
2801 
2802     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2803     // Compare the rest of substring (> 8 chars).
2804     movptr(str1, result);
2805 
2806     cmpl(tmp, cnt2);
2807     // First 8 chars are already matched.
2808     jccb(Assembler::equal, CHECK_NEXT);
2809 
2810     bind(SCAN_SUBSTR);
2811     pcmpestri(vec, Address(str1, 0), mode);
2812     // Need to reload string pointers if we did not match the whole vector
2813     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2814 
2815     bind(CHECK_NEXT);
2816     subl(cnt2, stride);
2817     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2818     addptr(str1, 16);
2819     if (ae == StrIntrinsicNode::UL) {
2820       addptr(str2, 8);
2821     } else {
2822       addptr(str2, 16);
2823     }
2824     subl(cnt1, stride);
2825     cmpl(cnt2, stride); // Do not read beyond substring
2826     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2827     // Back-up strings to avoid reading beyond substring.
2828 
2829     if (ae == StrIntrinsicNode::UL) {
2830       lea(str2, Address(str2, cnt2, scale2, -8));
2831       lea(str1, Address(str1, cnt2, scale1, -16));
2832     } else {
2833       lea(str2, Address(str2, cnt2, scale2, -16));
2834       lea(str1, Address(str1, cnt2, scale1, -16));
2835     }
2836     subl(cnt1, cnt2);
2837     movl(cnt2, stride);
2838     addl(cnt1, stride);
2839     bind(CONT_SCAN_SUBSTR);
2840     if (ae == StrIntrinsicNode::UL) {
2841       pmovzxbw(vec, Address(str2, 0));
2842     } else {
2843       movdqu(vec, Address(str2, 0));
2844     }
2845     jmp(SCAN_SUBSTR);
2846 
2847     bind(RET_FOUND_LONG);
2848     movptr(str1, Address(rsp, wordSize));
2849   } // non constant
2850 
2851   bind(RET_FOUND);
2852   // Compute substr offset
2853   subptr(result, str1);
2854   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2855     shrl(result, 1); // index
2856   }
2857   bind(CLEANUP);
2858   pop(rsp); // restore SP
2859 
2860 } // string_indexof
2861 
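// Find the first occurrence of a single UTF-16 char value in a char sequence.
// Roughly equivalent to the following Java-level loop (illustrative sketch
// only, not the exact library code):
//   static int indexOfChar(char[] value, int ch, int len) {
//     for (int i = 0; i < len; i++) {
//       if (value[i] == ch) return i;
//     }
//     return -1;
//   }
// The generated code scans 16 chars per iteration with AVX2, 8 chars per
// iteration with SSE4.2, and finishes the tail with a scalar loop.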
2862 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2863                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2864   ShortBranchVerifier sbv(this);
2865   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2866 
2867   int stride = 8;
2868 
2869   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2870         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2871         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2872         FOUND_SEQ_CHAR, DONE_LABEL;
2873 
2874   movptr(result, str1);
2875   if (UseAVX >= 2) {
2876     cmpl(cnt1, stride);
2877     jcc(Assembler::less, SCAN_TO_CHAR);
2878     cmpl(cnt1, 2*stride);
2879     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2880     movdl(vec1, ch);
2881     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2882     vpxor(vec2, vec2);
2883     movl(tmp, cnt1);
2884     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2885     andl(cnt1,0x0000000F);  //tail count (in chars)
2886 
2887     bind(SCAN_TO_16_CHAR_LOOP);
2888     vmovdqu(vec3, Address(result, 0));
2889     vpcmpeqw(vec3, vec3, vec1, 1);
2890     vptest(vec2, vec3);
2891     jcc(Assembler::carryClear, FOUND_CHAR);
2892     addptr(result, 32);
2893     subl(tmp, 2*stride);
2894     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2895     jmp(SCAN_TO_8_CHAR);
2896     bind(SCAN_TO_8_CHAR_INIT);
2897     movdl(vec1, ch);
2898     pshuflw(vec1, vec1, 0x00);
2899     pshufd(vec1, vec1, 0);
2900     pxor(vec2, vec2);
2901   }
2902   bind(SCAN_TO_8_CHAR);
2903   cmpl(cnt1, stride);
2904   jcc(Assembler::less, SCAN_TO_CHAR);
2905   if (UseAVX < 2) {
2906     movdl(vec1, ch);
2907     pshuflw(vec1, vec1, 0x00);
2908     pshufd(vec1, vec1, 0);
2909     pxor(vec2, vec2);
2910   }
2911   movl(tmp, cnt1);
2912   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2913   andl(cnt1,0x00000007);  //tail count (in chars)
2914 
2915   bind(SCAN_TO_8_CHAR_LOOP);
2916   movdqu(vec3, Address(result, 0));
2917   pcmpeqw(vec3, vec1);
2918   ptest(vec2, vec3);
2919   jcc(Assembler::carryClear, FOUND_CHAR);
2920   addptr(result, 16);
2921   subl(tmp, stride);
2922   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2923   bind(SCAN_TO_CHAR);
2924   testl(cnt1, cnt1);
2925   jcc(Assembler::zero, RET_NOT_FOUND);
2926   bind(SCAN_TO_CHAR_LOOP);
2927   load_unsigned_short(tmp, Address(result, 0));
2928   cmpl(ch, tmp);
2929   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2930   addptr(result, 2);
2931   subl(cnt1, 1);
2932   jccb(Assembler::zero, RET_NOT_FOUND);
2933   jmp(SCAN_TO_CHAR_LOOP);
2934 
2935   bind(RET_NOT_FOUND);
2936   movl(result, -1);
2937   jmpb(DONE_LABEL);
2938 
2939   bind(FOUND_CHAR);
2940   if (UseAVX >= 2) {
2941     vpmovmskb(tmp, vec3);
2942   } else {
2943     pmovmskb(tmp, vec3);
2944   }
2945   bsfl(ch, tmp);
2946   addptr(result, ch);
2947 
2948   bind(FOUND_SEQ_CHAR);
2949   subptr(result, str1);
2950   shrl(result, 1);
2951 
2952   bind(DONE_LABEL);
2953 } // string_indexof_char
2954 
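// Latin-1 variant of the search above: find the first occurrence of a single
// byte value in a byte sequence (illustrative sketch only):
//   static int indexOfByte(byte[] value, int ch, int len) {
//     for (int i = 0; i < len; i++) {
//       if ((value[i] & 0xff) == ch) return i;
//     }
//     return -1;
//   }
// Here 32 bytes are scanned per iteration with AVX2 and 16 bytes with SSE4.2.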
2955 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2956                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2957   ShortBranchVerifier sbv(this);
2958   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2959 
2960   int stride = 16;
2961 
2962   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2963         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2964         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2965         FOUND_SEQ_CHAR, DONE_LABEL;
2966 
2967   movptr(result, str1);
2968   if (UseAVX >= 2) {
2969     cmpl(cnt1, stride);
2970     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2971     cmpl(cnt1, stride*2);
2972     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2973     movdl(vec1, ch);
2974     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2975     vpxor(vec2, vec2);
2976     movl(tmp, cnt1);
2977     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2978     andl(cnt1,0x0000001F);  //tail count (in chars)
2979 
2980     bind(SCAN_TO_32_CHAR_LOOP);
2981     vmovdqu(vec3, Address(result, 0));
2982     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2983     vptest(vec2, vec3);
2984     jcc(Assembler::carryClear, FOUND_CHAR);
2985     addptr(result, 32);
2986     subl(tmp, stride*2);
2987     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2988     jmp(SCAN_TO_16_CHAR);
2989 
2990     bind(SCAN_TO_16_CHAR_INIT);
2991     movdl(vec1, ch);
2992     pxor(vec2, vec2);
2993     pshufb(vec1, vec2);
2994   }
2995 
2996   bind(SCAN_TO_16_CHAR);
2997   cmpl(cnt1, stride);
2998   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2999   if (UseAVX < 2) {
3000     movdl(vec1, ch);
3001     pxor(vec2, vec2);
3002     pshufb(vec1, vec2);
3003   }
3004   movl(tmp, cnt1);
3005   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3006   andl(cnt1,0x0000000F);  //tail count (in bytes)
3007 
3008   bind(SCAN_TO_16_CHAR_LOOP);
3009   movdqu(vec3, Address(result, 0));
3010   pcmpeqb(vec3, vec1);
3011   ptest(vec2, vec3);
3012   jcc(Assembler::carryClear, FOUND_CHAR);
3013   addptr(result, 16);
3014   subl(tmp, stride);
3015   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3016 
3017   bind(SCAN_TO_CHAR_INIT);
3018   testl(cnt1, cnt1);
3019   jcc(Assembler::zero, RET_NOT_FOUND);
3020   bind(SCAN_TO_CHAR_LOOP);
3021   load_unsigned_byte(tmp, Address(result, 0));
3022   cmpl(ch, tmp);
3023   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3024   addptr(result, 1);
3025   subl(cnt1, 1);
3026   jccb(Assembler::zero, RET_NOT_FOUND);
3027   jmp(SCAN_TO_CHAR_LOOP);
3028 
3029   bind(RET_NOT_FOUND);
3030   movl(result, -1);
3031   jmpb(DONE_LABEL);
3032 
3033   bind(FOUND_CHAR);
3034   if (UseAVX >= 2) {
3035     vpmovmskb(tmp, vec3);
3036   } else {
3037     pmovmskb(tmp, vec3);
3038   }
3039   bsfl(ch, tmp);
3040   addptr(result, ch);
3041 
3042   bind(FOUND_SEQ_CHAR);
3043   subptr(result, str1);
3044 
3045   bind(DONE_LABEL);
3046 } // stringL_indexof_char
3047 
3048 // helper function for string_compare
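// Loads one element from each string at 'index', zero-extended into a 32-bit
// register; 'ae' selects the element width: bytes for LL, shorts for UU, and
// for the mixed encodings a byte from str1 and a short from str2.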
3049 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3050                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3051                                            Address::ScaleFactor scale2, Register index, int ae) {
3052   if (ae == StrIntrinsicNode::LL) {
3053     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3054     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3055   } else if (ae == StrIntrinsicNode::UU) {
3056     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3057     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3058   } else {
3059     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3060     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3061   }
3062 }
3063 
3064 // Compare strings, used for char[] and byte[].
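// Sets 'result' to a negative value, zero, or a positive value, mirroring the
// Java-level comparison (illustrative sketch only, not the exact library code):
//   static int compare(char[] v1, int len1, char[] v2, int len2) {
//     int lim = Math.min(len1, len2);
//     for (int i = 0; i < lim; i++) {
//       if (v1[i] != v2[i]) return v1[i] - v2[i];
//     }
//     return len1 - len2;
//   }
// 'ae' names the encoding pair (LL, UU, LU or UL; L = latin1, U = UTF-16) and
// selects the element widths used for the loads below.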
3065 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3066                                        Register cnt1, Register cnt2, Register result,
3067                                        XMMRegister vec1, int ae, KRegister mask) {
3068   ShortBranchVerifier sbv(this);
3069   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3070   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3071   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3072   int stride2x2 = 0x40;
3073   Address::ScaleFactor scale = Address::no_scale;
3074   Address::ScaleFactor scale1 = Address::no_scale;
3075   Address::ScaleFactor scale2 = Address::no_scale;
3076 
3077   if (ae != StrIntrinsicNode::LL) {
3078     stride2x2 = 0x20;
3079   }
3080 
3081   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3082     shrl(cnt2, 1);
3083   }
3084   // Compute the minimum of the string lengths and push the
3085   // difference of the string lengths onto the stack.
3086   // The minimum is computed with a conditional move.
3087   movl(result, cnt1);
3088   subl(cnt1, cnt2);
3089   push(cnt1);
3090   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3091 
3092   // Is the minimum length zero?
3093   testl(cnt2, cnt2);
3094   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3095   if (ae == StrIntrinsicNode::LL) {
3096     // Load first bytes
3097     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3098     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3099   } else if (ae == StrIntrinsicNode::UU) {
3100     // Load first characters
3101     load_unsigned_short(result, Address(str1, 0));
3102     load_unsigned_short(cnt1, Address(str2, 0));
3103   } else {
3104     load_unsigned_byte(result, Address(str1, 0));
3105     load_unsigned_short(cnt1, Address(str2, 0));
3106   }
3107   subl(result, cnt1);
3108   jcc(Assembler::notZero,  POP_LABEL);
3109 
3110   if (ae == StrIntrinsicNode::UU) {
3111     // Divide length by 2 to get number of chars
3112     shrl(cnt2, 1);
3113   }
3114   cmpl(cnt2, 1);
3115   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3116 
3117   // Check if the strings start at the same location and setup scale and stride
3118   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3119     cmpptr(str1, str2);
3120     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3121     if (ae == StrIntrinsicNode::LL) {
3122       scale = Address::times_1;
3123       stride = 16;
3124     } else {
3125       scale = Address::times_2;
3126       stride = 8;
3127     }
3128   } else {
3129     scale1 = Address::times_1;
3130     scale2 = Address::times_2;
3131     // scale not used
3132     stride = 8;
3133   }
3134 
3135   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3136     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3137     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3138     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3139     Label COMPARE_TAIL_LONG;
3140     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3141 
3142     int pcmpmask = 0x19;
3143     if (ae == StrIntrinsicNode::LL) {
3144       pcmpmask &= ~0x01;
3145     }
3146 
3147     // Set up to compare 16-char (32-byte) vectors,
3148     // starting from the first character again because it has an aligned address.
3149     if (ae == StrIntrinsicNode::LL) {
3150       stride2 = 32;
3151     } else {
3152       stride2 = 16;
3153     }
3154     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3155       adr_stride = stride << scale;
3156     } else {
3157       adr_stride1 = 8;  //stride << scale1;
3158       adr_stride2 = 16; //stride << scale2;
3159     }
3160 
3161     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3162     // rax and rdx are used by pcmpestri as element counters
3163     movl(result, cnt2);
3164     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3165     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3166 
3167     // Fast path: compare the first two 8-char vectors.
3168     bind(COMPARE_16_CHARS);
3169     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3170       movdqu(vec1, Address(str1, 0));
3171     } else {
3172       pmovzxbw(vec1, Address(str1, 0));
3173     }
3174     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3175     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3176 
3177     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3178       movdqu(vec1, Address(str1, adr_stride));
3179       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3180     } else {
3181       pmovzxbw(vec1, Address(str1, adr_stride1));
3182       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3183     }
3184     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3185     addl(cnt1, stride);
3186 
3187     // Compare the characters at index in cnt1
3188     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3189     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3190     subl(result, cnt2);
3191     jmp(POP_LABEL);
3192 
3193     // Setup the registers to start vector comparison loop
3194     bind(COMPARE_WIDE_VECTORS);
3195     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3196       lea(str1, Address(str1, result, scale));
3197       lea(str2, Address(str2, result, scale));
3198     } else {
3199       lea(str1, Address(str1, result, scale1));
3200       lea(str2, Address(str2, result, scale2));
3201     }
3202     subl(result, stride2);
3203     subl(cnt2, stride2);
3204     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3205     negptr(result);
3206 
3207     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3208     bind(COMPARE_WIDE_VECTORS_LOOP);
3209 
3210 #ifdef _LP64
3211     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3212       cmpl(cnt2, stride2x2);
3213       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3214       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3215       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3216 
3217       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3218       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3219         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3220         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3221       } else {
3222         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3223         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3224       }
3225       kortestql(mask, mask);
3226       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3227       addptr(result, stride2x2);  // update since we already compared at this addr
3228       subl(cnt2, stride2x2);      // and sub the size too
3229       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3230 
3231       vpxor(vec1, vec1);
3232       jmpb(COMPARE_WIDE_TAIL);
3233     }//if (VM_Version::supports_avx512vlbw())
3234 #endif // _LP64
3235 
3236 
3237     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3238     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3239       vmovdqu(vec1, Address(str1, result, scale));
3240       vpxor(vec1, Address(str2, result, scale));
3241     } else {
3242       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3243       vpxor(vec1, Address(str2, result, scale2));
3244     }
3245     vptest(vec1, vec1);
3246     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3247     addptr(result, stride2);
3248     subl(cnt2, stride2);
3249     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3250     // clean upper bits of YMM registers
3251     vpxor(vec1, vec1);
3252 
3253     // compare wide vectors tail
3254     bind(COMPARE_WIDE_TAIL);
3255     testptr(result, result);
3256     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3257 
3258     movl(result, stride2);
3259     movl(cnt2, result);
3260     negptr(result);
3261     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3262 
3263     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3264     bind(VECTOR_NOT_EQUAL);
3265     // clean upper bits of YMM registers
3266     vpxor(vec1, vec1);
3267     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3268       lea(str1, Address(str1, result, scale));
3269       lea(str2, Address(str2, result, scale));
3270     } else {
3271       lea(str1, Address(str1, result, scale1));
3272       lea(str2, Address(str2, result, scale2));
3273     }
3274     jmp(COMPARE_16_CHARS);
3275 
3276     // Compare tail chars, length is between 1 and 15 chars
3277     bind(COMPARE_TAIL_LONG);
3278     movl(cnt2, result);
3279     cmpl(cnt2, stride);
3280     jcc(Assembler::less, COMPARE_SMALL_STR);
3281 
3282     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3283       movdqu(vec1, Address(str1, 0));
3284     } else {
3285       pmovzxbw(vec1, Address(str1, 0));
3286     }
3287     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3288     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3289     subptr(cnt2, stride);
3290     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3291     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3292       lea(str1, Address(str1, result, scale));
3293       lea(str2, Address(str2, result, scale));
3294     } else {
3295       lea(str1, Address(str1, result, scale1));
3296       lea(str2, Address(str2, result, scale2));
3297     }
3298     negptr(cnt2);
3299     jmpb(WHILE_HEAD_LABEL);
3300 
3301     bind(COMPARE_SMALL_STR);
3302   } else if (UseSSE42Intrinsics) {
3303     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3304     int pcmpmask = 0x19;
3305     // Set up to compare 8-char (16-byte) vectors,
3306     // starting from the first character again because it has an aligned address.
3307     movl(result, cnt2);
3308     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3309     if (ae == StrIntrinsicNode::LL) {
3310       pcmpmask &= ~0x01;
3311     }
3312     jcc(Assembler::zero, COMPARE_TAIL);
3313     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3314       lea(str1, Address(str1, result, scale));
3315       lea(str2, Address(str2, result, scale));
3316     } else {
3317       lea(str1, Address(str1, result, scale1));
3318       lea(str2, Address(str2, result, scale2));
3319     }
3320     negptr(result);
3321 
3322     // pcmpestri
3323     //   inputs:
3324     //     vec1 - substring
3325     //     rax - negative string length (elements count)
3326     //     mem - scanned string
3327     //     rdx - string length (elements count)
3328     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3329     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3330     //   outputs:
3331     //     rcx - first mismatched element index
3332     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3333 
3334     bind(COMPARE_WIDE_VECTORS);
3335     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3336       movdqu(vec1, Address(str1, result, scale));
3337       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3338     } else {
3339       pmovzxbw(vec1, Address(str1, result, scale1));
3340       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3341     }
3342     // After pcmpestri cnt1(rcx) contains mismatched element index
3343 
3344     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3345     addptr(result, stride);
3346     subptr(cnt2, stride);
3347     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3348 
3349     // compare wide vectors tail
3350     testptr(result, result);
3351     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3352 
3353     movl(cnt2, stride);
3354     movl(result, stride);
3355     negptr(result);
3356     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3357       movdqu(vec1, Address(str1, result, scale));
3358       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3359     } else {
3360       pmovzxbw(vec1, Address(str1, result, scale1));
3361       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3362     }
3363     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3364 
3365     // Mismatched characters in the vectors
3366     bind(VECTOR_NOT_EQUAL);
3367     addptr(cnt1, result);
3368     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3369     subl(result, cnt2);
3370     jmpb(POP_LABEL);
3371 
3372     bind(COMPARE_TAIL); // limit is zero
3373     movl(cnt2, result);
3374     // Fallthru to tail compare
3375   }
3376   // Shift str2 and str1 to the end of the arrays, negate min
3377   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3378     lea(str1, Address(str1, cnt2, scale));
3379     lea(str2, Address(str2, cnt2, scale));
3380   } else {
3381     lea(str1, Address(str1, cnt2, scale1));
3382     lea(str2, Address(str2, cnt2, scale2));
3383   }
3384   decrementl(cnt2);  // first character was compared already
3385   negptr(cnt2);
3386 
3387   // Compare the rest of the elements
3388   bind(WHILE_HEAD_LABEL);
3389   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3390   subl(result, cnt1);
3391   jccb(Assembler::notZero, POP_LABEL);
3392   increment(cnt2);
3393   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3394 
3395   // Strings are equal up to min length.  Return the length difference.
3396   bind(LENGTH_DIFF_LABEL);
3397   pop(result);
3398   if (ae == StrIntrinsicNode::UU) {
3399     // Divide diff by 2 to get number of chars
3400     sarl(result, 1);
3401   }
3402   jmpb(DONE_LABEL);
3403 
3404 #ifdef _LP64
3405   if (VM_Version::supports_avx512vlbw()) {
3406 
3407     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3408 
3409     kmovql(cnt1, mask);
3410     notq(cnt1);
3411     bsfq(cnt2, cnt1);
3412     if (ae != StrIntrinsicNode::LL) {
3413       // Divide diff by 2 to get number of chars
3414       sarl(cnt2, 1);
3415     }
3416     addq(result, cnt2);
3417     if (ae == StrIntrinsicNode::LL) {
3418       load_unsigned_byte(cnt1, Address(str2, result));
3419       load_unsigned_byte(result, Address(str1, result));
3420     } else if (ae == StrIntrinsicNode::UU) {
3421       load_unsigned_short(cnt1, Address(str2, result, scale));
3422       load_unsigned_short(result, Address(str1, result, scale));
3423     } else {
3424       load_unsigned_short(cnt1, Address(str2, result, scale2));
3425       load_unsigned_byte(result, Address(str1, result, scale1));
3426     }
3427     subl(result, cnt1);
3428     jmpb(POP_LABEL);
3429   }//if (VM_Version::supports_avx512vlbw())
3430 #endif // _LP64
3431 
3432   // Discard the stored length difference
3433   bind(POP_LABEL);
3434   pop(cnt1);
3435 
3436   // That's it
3437   bind(DONE_LABEL);
3438   if (ae == StrIntrinsicNode::UL) {
3439     negl(result);
3440   }
3441 
3442 }
3443 
3444 // Search for a non-ASCII character (negative byte value) in a byte array,
3445 // return true if it has one and false otherwise.
3446 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3447 //   @IntrinsicCandidate
3448 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3449 //     for (int i = off; i < off + len; i++) {
3450 //       if (ba[i] < 0) {
3451 //         return true;
3452 //       }
3453 //     }
3454 //     return false;
3455 //   }
3456 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3457   Register result, Register tmp1,
3458   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3459   // rsi: byte array
3460   // rcx: len
3461   // rax: result
3462   ShortBranchVerifier sbv(this);
3463   assert_different_registers(ary1, len, result, tmp1);
3464   assert_different_registers(vec1, vec2);
3465   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3466 
3467   // len == 0
3468   testl(len, len);
3469   jcc(Assembler::zero, FALSE_LABEL);
3470 
3471   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3472     VM_Version::supports_avx512vlbw() &&
3473     VM_Version::supports_bmi2()) {
3474 
3475     Label test_64_loop, test_tail;
3476     Register tmp3_aliased = len;
3477 
3478     movl(tmp1, len);
3479     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3480 
3481     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3482     andl(len, ~(64 - 1));    // vector count (in chars)
3483     jccb(Assembler::zero, test_tail);
3484 
3485     lea(ary1, Address(ary1, len, Address::times_1));
3486     negptr(len);
3487 
3488     bind(test_64_loop);
3489     // Check whether our 64 elements of size byte contain negatives
3490     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3491     kortestql(mask1, mask1);
3492     jcc(Assembler::notZero, TRUE_LABEL);
3493 
3494     addptr(len, 64);
3495     jccb(Assembler::notZero, test_64_loop);
3496 
3497 
3498     bind(test_tail);
3499     // bail out when there is nothing to be done
3500     testl(tmp1, -1);
3501     jcc(Assembler::zero, FALSE_LABEL);
3502 
3503     // ~(~0 << len) applied up to two times (for 32-bit scenario)
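    // e.g. with len (tmp1) == 5: ~(~0 << 5) == 0b11111, i.e. the low 5 bits set.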
3504 #ifdef _LP64
3505     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3506     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3507     notq(tmp3_aliased);
3508     kmovql(mask2, tmp3_aliased);
3509 #else
3510     Label k_init;
3511     jmp(k_init);
3512 
3513     // We cannot load 64 bits into a general purpose register here, so the
3514     // data required to compose 64 1's is emitted into the instruction stream:
3515     // a 64-byte-wide series of the elements 0..63, which is later used as a
3516     // compare target against the tail count contained in the tmp1 register.
3517     // The result is a k register holding tmp1 consecutive 1's, counting from
3518     // the least significant bit.
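    // Example: with tmp1 == 5, broadcasting 5 and comparing it 'greater than'
    // the 0..63 byte table below sets exactly the low 5 bits of the k register.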
3519     address tmp = pc();
3520     emit_int64(0x0706050403020100);
3521     emit_int64(0x0F0E0D0C0B0A0908);
3522     emit_int64(0x1716151413121110);
3523     emit_int64(0x1F1E1D1C1B1A1918);
3524     emit_int64(0x2726252423222120);
3525     emit_int64(0x2F2E2D2C2B2A2928);
3526     emit_int64(0x3736353433323130);
3527     emit_int64(0x3F3E3D3C3B3A3938);
3528 
3529     bind(k_init);
3530     lea(len, InternalAddress(tmp));
3531     // create mask to test for negative byte inside a vector
3532     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3533     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3534 
3535 #endif
3536     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3537     ktestq(mask1, mask2);
3538     jcc(Assembler::notZero, TRUE_LABEL);
3539 
3540     jmp(FALSE_LABEL);
3541   } else {
3542     movl(result, len); // copy
3543 
3544     if (UseAVX >= 2 && UseSSE >= 2) {
3545       // With AVX2, use 32-byte vector compare
3546       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3547 
3548       // Compare 32-byte vectors
3549       andl(result, 0x0000001f);  //   tail count (in bytes)
3550       andl(len, 0xffffffe0);   // vector count (in bytes)
3551       jccb(Assembler::zero, COMPARE_TAIL);
3552 
3553       lea(ary1, Address(ary1, len, Address::times_1));
3554       negptr(len);
3555 
3556       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3557       movdl(vec2, tmp1);
3558       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3559 
3560       bind(COMPARE_WIDE_VECTORS);
3561       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3562       vptest(vec1, vec2);
3563       jccb(Assembler::notZero, TRUE_LABEL);
3564       addptr(len, 32);
3565       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3566 
3567       testl(result, result);
3568       jccb(Assembler::zero, FALSE_LABEL);
3569 
3570       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3571       vptest(vec1, vec2);
3572       jccb(Assembler::notZero, TRUE_LABEL);
3573       jmpb(FALSE_LABEL);
3574 
3575       bind(COMPARE_TAIL); // len is zero
3576       movl(len, result);
3577       // Fallthru to tail compare
3578     } else if (UseSSE42Intrinsics) {
3579       // With SSE4.2, use double quad vector compare
3580       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3581 
3582       // Compare 16-byte vectors
3583       andl(result, 0x0000000f);  //   tail count (in bytes)
3584       andl(len, 0xfffffff0);   // vector count (in bytes)
3585       jcc(Assembler::zero, COMPARE_TAIL);
3586 
3587       lea(ary1, Address(ary1, len, Address::times_1));
3588       negptr(len);
3589 
3590       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3591       movdl(vec2, tmp1);
3592       pshufd(vec2, vec2, 0);
3593 
3594       bind(COMPARE_WIDE_VECTORS);
3595       movdqu(vec1, Address(ary1, len, Address::times_1));
3596       ptest(vec1, vec2);
3597       jcc(Assembler::notZero, TRUE_LABEL);
3598       addptr(len, 16);
3599       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3600 
3601       testl(result, result);
3602       jcc(Assembler::zero, FALSE_LABEL);
3603 
3604       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3605       ptest(vec1, vec2);
3606       jccb(Assembler::notZero, TRUE_LABEL);
3607       jmpb(FALSE_LABEL);
3608 
3609       bind(COMPARE_TAIL); // len is zero
3610       movl(len, result);
3611       // Fallthru to tail compare
3612     }
3613   }
3614   // Compare 4-byte vectors
3615   andl(len, 0xfffffffc); // vector count (in bytes)
3616   jccb(Assembler::zero, COMPARE_CHAR);
3617 
3618   lea(ary1, Address(ary1, len, Address::times_1));
3619   negptr(len);
3620 
3621   bind(COMPARE_VECTORS);
3622   movl(tmp1, Address(ary1, len, Address::times_1));
3623   andl(tmp1, 0x80808080);
3624   jccb(Assembler::notZero, TRUE_LABEL);
3625   addptr(len, 4);
3626   jcc(Assembler::notZero, COMPARE_VECTORS);
3627 
3628   // Compare trailing char (final 2 bytes), if any
3629   bind(COMPARE_CHAR);
3630   testl(result, 0x2);   // tail  char
3631   jccb(Assembler::zero, COMPARE_BYTE);
3632   load_unsigned_short(tmp1, Address(ary1, 0));
3633   andl(tmp1, 0x00008080);
3634   jccb(Assembler::notZero, TRUE_LABEL);
3635   subptr(result, 2);
3636   lea(ary1, Address(ary1, 2));
3637 
3638   bind(COMPARE_BYTE);
3639   testl(result, 0x1);   // tail  byte
3640   jccb(Assembler::zero, FALSE_LABEL);
3641   load_unsigned_byte(tmp1, Address(ary1, 0));
3642   andl(tmp1, 0x00000080);
3643   jccb(Assembler::notEqual, TRUE_LABEL);
3644   jmpb(FALSE_LABEL);
3645 
3646   bind(TRUE_LABEL);
3647   movl(result, 1);   // return true
3648   jmpb(DONE);
3649 
3650   bind(FALSE_LABEL);
3651   xorl(result, result); // return false
3652 
3653   // That's it
3654   bind(DONE);
3655   if (UseAVX >= 2 && UseSSE >= 2) {
3656     // clean upper bits of YMM registers
3657     vpxor(vec1, vec1);
3658     vpxor(vec2, vec2);
3659   }
3660 }
3661 // Compare char[] or byte[] arrays (aligned to 4 bytes) or substrings.
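// Sets 'result' to 1 if the inputs are equal and to 0 otherwise, roughly like
// the following Java-level check (illustrative sketch only, not the exact
// library code):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }
// When is_array_equ is false the null/length checks are skipped and the inputs
// are treated as raw (address, element count) substrings.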
3662 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3663                                       Register limit, Register result, Register chr,
3664                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3665   ShortBranchVerifier sbv(this);
3666   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3667 
3668   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3669   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3670 
3671   if (is_array_equ) {
3672     // Check the input args
3673     cmpoop(ary1, ary2);
3674     jcc(Assembler::equal, TRUE_LABEL);
3675 
3676     // Need additional checks for arrays_equals.
3677     testptr(ary1, ary1);
3678     jcc(Assembler::zero, FALSE_LABEL);
3679     testptr(ary2, ary2);
3680     jcc(Assembler::zero, FALSE_LABEL);
3681 
3682     // Check the lengths
3683     movl(limit, Address(ary1, length_offset));
3684     cmpl(limit, Address(ary2, length_offset));
3685     jcc(Assembler::notEqual, FALSE_LABEL);
3686   }
3687 
3688   // count == 0
3689   testl(limit, limit);
3690   jcc(Assembler::zero, TRUE_LABEL);
3691 
3692   if (is_array_equ) {
3693     // Load array address
3694     lea(ary1, Address(ary1, base_offset));
3695     lea(ary2, Address(ary2, base_offset));
3696   }
3697 
3698   if (is_array_equ && is_char) {
3699     // arrays_equals when used for char[].
3700     shll(limit, 1);      // byte count != 0
3701   }
3702   movl(result, limit); // copy
3703 
3704   if (UseAVX >= 2) {
3705     // With AVX2, use 32-byte vector compare
3706     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3707 
3708     // Compare 32-byte vectors
3709     andl(result, 0x0000001f);  //   tail count (in bytes)
3710     andl(limit, 0xffffffe0);   // vector count (in bytes)
3711     jcc(Assembler::zero, COMPARE_TAIL);
3712 
3713     lea(ary1, Address(ary1, limit, Address::times_1));
3714     lea(ary2, Address(ary2, limit, Address::times_1));
3715     negptr(limit);
3716 
3717 #ifdef _LP64
3718     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3719       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3720 
3721       cmpl(limit, -64);
3722       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3723 
3724       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3725 
3726       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3727       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3728       kortestql(mask, mask);
3729       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3730       addptr(limit, 64);  // update since we already compared at this addr
3731       cmpl(limit, -64);
3732       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3733 
3734       // At this point we may still need to compare -limit+result bytes.
3735       // We could execute the next two instructions and just continue via the non-wide path:
3736       //  cmpl(limit, 0);
3737       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3738       // But since we stopped at the points ary{1,2}+limit which are
3739       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3740       // (|limit| <= 32 and result < 32),
3741       // we may just compare the last 64 bytes.
3742       //
3743       addptr(result, -64);   // it is safe, because we just came from this area
3744       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3745       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3746       kortestql(mask, mask);
3747       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3748 
3749       jmp(TRUE_LABEL);
3750 
3751       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3752 
3753     }//if (VM_Version::supports_avx512vlbw())
3754 #endif //_LP64
3755     bind(COMPARE_WIDE_VECTORS);
3756     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3757     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3758     vpxor(vec1, vec2);
3759 
3760     vptest(vec1, vec1);
3761     jcc(Assembler::notZero, FALSE_LABEL);
3762     addptr(limit, 32);
3763     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3764 
3765     testl(result, result);
3766     jcc(Assembler::zero, TRUE_LABEL);
3767 
3768     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3769     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3770     vpxor(vec1, vec2);
3771 
3772     vptest(vec1, vec1);
3773     jccb(Assembler::notZero, FALSE_LABEL);
3774     jmpb(TRUE_LABEL);
3775 
3776     bind(COMPARE_TAIL); // limit is zero
3777     movl(limit, result);
3778     // Fallthru to tail compare
3779   } else if (UseSSE42Intrinsics) {
3780     // With SSE4.2, use double quad vector compare
3781     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3782 
3783     // Compare 16-byte vectors
3784     andl(result, 0x0000000f);  //   tail count (in bytes)
3785     andl(limit, 0xfffffff0);   // vector count (in bytes)
3786     jcc(Assembler::zero, COMPARE_TAIL);
3787 
3788     lea(ary1, Address(ary1, limit, Address::times_1));
3789     lea(ary2, Address(ary2, limit, Address::times_1));
3790     negptr(limit);
3791 
3792     bind(COMPARE_WIDE_VECTORS);
3793     movdqu(vec1, Address(ary1, limit, Address::times_1));
3794     movdqu(vec2, Address(ary2, limit, Address::times_1));
3795     pxor(vec1, vec2);
3796 
3797     ptest(vec1, vec1);
3798     jcc(Assembler::notZero, FALSE_LABEL);
3799     addptr(limit, 16);
3800     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3801 
3802     testl(result, result);
3803     jcc(Assembler::zero, TRUE_LABEL);
3804 
3805     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3806     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3807     pxor(vec1, vec2);
3808 
3809     ptest(vec1, vec1);
3810     jccb(Assembler::notZero, FALSE_LABEL);
3811     jmpb(TRUE_LABEL);
3812 
3813     bind(COMPARE_TAIL); // limit is zero
3814     movl(limit, result);
3815     // Fallthru to tail compare
3816   }
3817 
3818   // Compare 4-byte vectors
3819   andl(limit, 0xfffffffc); // vector count (in bytes)
3820   jccb(Assembler::zero, COMPARE_CHAR);
3821 
3822   lea(ary1, Address(ary1, limit, Address::times_1));
3823   lea(ary2, Address(ary2, limit, Address::times_1));
3824   negptr(limit);
3825 
3826   bind(COMPARE_VECTORS);
3827   movl(chr, Address(ary1, limit, Address::times_1));
3828   cmpl(chr, Address(ary2, limit, Address::times_1));
3829   jccb(Assembler::notEqual, FALSE_LABEL);
3830   addptr(limit, 4);
3831   jcc(Assembler::notZero, COMPARE_VECTORS);
3832 
3833   // Compare trailing char (final 2 bytes), if any
3834   bind(COMPARE_CHAR);
3835   testl(result, 0x2);   // tail  char
3836   jccb(Assembler::zero, COMPARE_BYTE);
3837   load_unsigned_short(chr, Address(ary1, 0));
3838   load_unsigned_short(limit, Address(ary2, 0));
3839   cmpl(chr, limit);
3840   jccb(Assembler::notEqual, FALSE_LABEL);
3841 
3842   if (is_array_equ && is_char) {
3843     bind(COMPARE_BYTE);
3844   } else {
3845     lea(ary1, Address(ary1, 2));
3846     lea(ary2, Address(ary2, 2));
3847 
3848     bind(COMPARE_BYTE);
3849     testl(result, 0x1);   // tail  byte
3850     jccb(Assembler::zero, TRUE_LABEL);
3851     load_unsigned_byte(chr, Address(ary1, 0));
3852     load_unsigned_byte(limit, Address(ary2, 0));
3853     cmpl(chr, limit);
3854     jccb(Assembler::notEqual, FALSE_LABEL);
3855   }
3856   bind(TRUE_LABEL);
3857   movl(result, 1);   // return true
3858   jmpb(DONE);
3859 
3860   bind(FALSE_LABEL);
3861   xorl(result, result); // return false
3862 
3863   // That's it
3864   bind(DONE);
3865   if (UseAVX >= 2) {
3866     // clean upper bits of YMM registers
3867     vpxor(vec1, vec1);
3868     vpxor(vec2, vec2);
3869   }
3870 }
3871 
3872 #ifdef _LP64
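// Reduce a vector mask to a scalar. The mask lanes are expected to hold 0/1
// boolean bytes; they are negated so that evpmovb2m (or vpmovmskb below) can
// collect the sign bits into a GPR bitmask, after which:
//   Op_VectorMaskTrueCount  -> population count of the bitmask
//   Op_VectorMaskFirstTrue  -> index of the lowest set bit, or masklen if none
//   Op_VectorMaskLastTrue   -> index of the highest set bit, or -1 if none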
3873 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3874                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3875   assert(VM_Version::supports_avx512vlbw(), "");
3876   vpxor(xtmp, xtmp, xtmp, vec_enc);
3877   vpsubb(xtmp, xtmp, mask, vec_enc);
3878   evpmovb2m(ktmp, xtmp, vec_enc);
3879   kmovql(tmp, ktmp);
3880   switch(opc) {
3881     case Op_VectorMaskTrueCount:
3882       popcntq(dst, tmp);
3883       break;
3884     case Op_VectorMaskLastTrue:
3885       mov64(dst, -1);
3886       bsrq(tmp, tmp);
3887       cmov(Assembler::notZero, dst, tmp);
3888       break;
3889     case Op_VectorMaskFirstTrue:
3890       mov64(dst, masklen);
3891       bsfq(tmp, tmp);
3892       cmov(Assembler::notZero, dst, tmp);
3893       break;
3894     default: assert(false, "Unhandled mask operation");
3895   }
3896 }
3897 
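// AVX/AVX2 variant of the mask reduction above: vpmovmskb extracts the sign
// bits directly into a GPR and, when masklen < 64, the bits above masklen are
// cleared before the reduction.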
3898 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3899                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3900   assert(VM_Version::supports_avx(), "");
3901   vpxor(xtmp, xtmp, xtmp, vec_enc);
3902   vpsubb(xtmp, xtmp, mask, vec_enc);
3903   vpmovmskb(tmp, xtmp, vec_enc);
3904   if (masklen < 64) {
3905     andq(tmp, (((jlong)1 << masklen) - 1));
3906   }
3907   switch(opc) {
3908     case Op_VectorMaskTrueCount:
3909       popcntq(dst, tmp);
3910       break;
3911     case Op_VectorMaskLastTrue:
3912       mov64(dst, -1);
3913       bsrq(tmp, tmp);
3914       cmov(Assembler::notZero, dst, tmp);
3915       break;
3916     case Op_VectorMaskFirstTrue:
3917       mov64(dst, masklen);
3918       bsfq(tmp, tmp);
3919       cmov(Assembler::notZero, dst, tmp);
3920       break;
3921     default: assert(false, "Unhandled mask operation");
3922   }
3923 }
3924 #endif
3925 
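// Cross-lane byte rearrange, conceptually dst[i] = src[shuffle[i]] for each
// byte lane of a 512-bit vector. Since vpshufb only shuffles within 128-bit
// lanes, the code below broadcasts each 128-bit source lane in turn and merges
// in the bytes whose shuffle index selects that lane.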
3926 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
3927                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
3928                                         int vlen_enc) {
3929   assert(VM_Version::supports_avx512bw(), "");
3930   // Byte shuffles are in-lane operations and indices are determined using
3931   // the lower 4 bits of each shuffle lane, thus all shuffle indices are
3932   // normalized to the index range 0-15. This makes sure that indices which
3933   // differ by a multiple of 16 map to the same relative position within a
3934   // 128-bit lane, e.g. source elements at shuffle indices 16, 32 and 48 are
3935   // each the first element of their respective 128-bit lanes.
3936   movl(rtmp, 16);
3937   evpbroadcastb(xtmp1, rtmp, vlen_enc);
3938 
3939   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
3940   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
3941   // original shuffle indices and move the shuffled lanes corresponding to true
3942   // mask to destination vector.
3943   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
3944   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
3945   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
3946 
3947   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
3948   // and broadcasting second 128 bit lane.
3949   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3950   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
3951   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
3952   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
3953   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3954 
3955   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
3956   // and broadcasting third 128 bit lane.
3957   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
3958   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
3959   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
3960   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
3961   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3962 
3963   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
3964   // and broadcasting the fourth 128 bit lane.
3965   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3966   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
3967   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
3968   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
3969   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3970 }