/*
 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}
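// Illustrative example (not emitted code): with src == 3 the sequence above
// computes dst = (1 << 3) - 1 = 0b111, copies that value into the opmask
// register so that only the low three vector lanes are active, and then
// restores the original trip count from src back into dst.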

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}
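// Illustrative example (not emitted code): with count == 64 the low bits of
// the time-stamp counter serve as a cheap pseudo-random source, so
// (rdtsc_low & 63) != 0 holds roughly 63 times out of 64 and the branch to
// brLabel is taken; only about 1 in 64 executions falls through, which lets
// callers sample expensive counter updates instead of doing them every time.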

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
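  //
  // Worked example (illustrative; assumes RTMTotalCountIncrRate == 64 and
  // RTMAbortRatio == 50): with abort_count == 4000 and total_count == 100,
  // Aborted = 4000 * 100 = 400000 and All = 100 * 64 = 6400, so
  // All * RTMAbortRatio = 320000.  Because 400000 >= 320000 the abort ratio
  // is considered too high and the no_rtm bit is set below.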

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}
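// Roughly equivalent C-style sketch of the sequence above (illustrative only):
//   if (retry_count == 0) { retry_count++; /* fall out with ZF == 0 */ }
//   else {
//     retry_count--;
//     int spins = RTMSpinLoopCount;
//     do { pause(); } while (--spins > 0 && monitor->owner != NULL);
//     goto retry;
//   }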

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                         Register retry_on_abort_count_Reg,
                                         RTMLockingCounters* stack_rtm_counters,
                                         Metadata* method_data, bool profile_rtm,
                                         Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
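// Roughly equivalent sketch of the transactional fast path above (illustrative only):
//   retry:
//     if (obj->mark() is inflated) goto IsInflated;          // monitor already exists
//     xbegin(on_abort);                                      // start HTM transaction
//     if ((obj->mark() & lock_mask) == unlocked) goto DONE;  // lock elided inside the transaction
//     // object is stack-locked by some thread: either xend() and report
//     // "retryable" (0x2), or xabort() and let the abort path run
//   on_abort:
//     update RTM profiling counters if requested;
//     if (abort status is retryable and retries remain) goto retry;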

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                            Register scrReg, Register retry_on_busy_count_Reg,
                                            Register retry_on_abort_count_Reg,
                                            RTMLockingCounters* rtm_counters,
                                            Metadata* method_data, bool profile_rtm,
                                            Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                 Register scrReg, Register cx1Reg, Register cx2Reg,
                                 RTMLockingCounters* rtm_counters,
                                 RTMLockingCounters* stack_rtm_counters,
                                 Metadata* method_data,
                                 bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  if (EnableValhalla) {
    // Mask inline_type bit such that we go to the slow path if object is an inline type
    andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
  }
  movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
  jcc(Assembler::equal, DONE_LABEL);           // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);


  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}
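// Roughly equivalent sketch of the stack-locking fast path above (illustrative only):
//   mark = obj->mark();
//   if (mark is inflated) { CAS the owning thread into monitor->_owner; ZF = CAS result; }
//   else {
//     box->displaced_header = mark | unlocked_value;                // anticipate success
//     if (CAS(&obj->mark(), mark | unlocked_value, box)) ZF = 1;    // now stack-locked
//     else {
//       // CAS failed: mark may be a pointer into this thread's own stack (recursive lock)
//       box->displaced_header = (mark - rsp) & ~(page_size - 1);    // simplified mask
//       ZF = (box->displaced_header == 0);                          // 0 => recursive => success
//     }
//   }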

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0));      // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}
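// Illustrative note (not emitted code): abs is formed by AND-ing away the
// IEEE-754 sign bit and negation by XOR-ing it.  For a double the mask
// 0x7FFFFFFFFFFFFFFF yields |x| and 0x8000000000000000 yields -x, so
// abs(-2.5) == 2.5 and neg(2.5) == -2.5 without any branches.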

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}
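// Illustrative note (not emitted code): SSE has no packed min/max for 64-bit
// lanes, so the T_LONG case above is synthesized from a signed compare and a
// blend.  For Op_MinV, xmm0 = (dst > src) ? all-ones : all-zeros per lane,
// and blendvpd then takes src in exactly those lanes, leaving the smaller
// value in dst; Op_MaxV swaps the compare operands to keep the larger value.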

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}
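// Illustrative note (not emitted code): the blend/compare sequence above
// exists to match Java's Math.min/max semantics rather than raw vminps/vmaxps:
// min(-0.0, +0.0) must be -0.0 and a NaN in either operand must propagate,
// whereas the bare x86 min/max instructions simply return the second source
// operand in those cases.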

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}
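// Illustrative example (not emitted code): for dst == -3.5f the compare with
// zero reports "below", so dst is first loaded with 1.0 and then its sign bit
// is flipped via XOR with the sign-flip constant, producing -1.0; +0.0, -0.0
// and NaN inputs take the early exits and are returned unchanged.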

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift);   break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift);  break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}
1210 
1211 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1212   switch (opcode) {
1213     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1214     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1215     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1216 
1217     default: assert(false, "%s", NodeClassNames[opcode]);
1218   }
1219 }
1220 
1221 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1222   if (opcode == Op_RShiftVL) {
1223     evpsraq(dst, nds, shift, vector_len);
1224   } else if (opcode == Op_LShiftVL) {
1225     vpsllq(dst, nds, shift, vector_len);
1226   } else {
1227     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1228     vpsrlq(dst, nds, shift, vector_len);
1229   }
1230 }
1231 
1232 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1233   switch (opcode) {
1234     case Op_RShiftVB:  // fall-through
1235     case Op_RShiftVS:  // fall-through
1236     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1237 
1238     case Op_LShiftVB:  // fall-through
1239     case Op_LShiftVS:  // fall-through
1240     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1241 
1242     case Op_URShiftVB: // fall-through
1243     case Op_URShiftVS: // fall-through
1244     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1245 
1246     default: assert(false, "%s", NodeClassNames[opcode]);
1247   }
1248 }
1249 
1250 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1251   switch (opcode) {
1252     case Op_RShiftVB:  // fall-through
1253     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1254 
1255     case Op_LShiftVB:  // fall-through
1256     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1257 
1258     case Op_URShiftVB: // fall-through
1259     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1260 
1261     default: assert(false, "%s", NodeClassNames[opcode]);
1262   }
1263 }
1264 
1265 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1266   assert(UseAVX >= 2, "required");
1267   switch (opcode) {
1268     case Op_RShiftVL: {
1269       if (UseAVX > 2) {
1270         assert(tmp == xnoreg, "not used");
1271         if (!VM_Version::supports_avx512vl()) {
1272           vlen_enc = Assembler::AVX_512bit;
1273         }
1274         evpsravq(dst, src, shift, vlen_enc);
1275       } else {
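             // Pre-AVX512 fallback: emulate a 64-bit arithmetic right shift with the
             // identity sra(x, s) == ((x >>> s) ^ m) - m, where m == 0x8000000000000000 >>> s
             // (the sign mask is loaded below from vector_long_sign_mask() and shifted alongside src).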
1276         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1277         vpsrlvq(dst, src, shift, vlen_enc);
1278         vpsrlvq(tmp, tmp, shift, vlen_enc);
1279         vpxor(dst, dst, tmp, vlen_enc);
1280         vpsubq(dst, dst, tmp, vlen_enc);
1281       }
1282       break;
1283     }
1284     case Op_LShiftVL: {
1285       assert(tmp == xnoreg, "not used");
1286       vpsllvq(dst, src, shift, vlen_enc);
1287       break;
1288     }
1289     case Op_URShiftVL: {
1290       assert(tmp == xnoreg, "not used");
1291       vpsrlvq(dst, src, shift, vlen_enc);
1292       break;
1293     }
1294     default: assert(false, "%s", NodeClassNames[opcode]);
1295   }
1296 }
1297 
1298 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
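     // There is no variable per-element byte shift on x86, so the bytes are widened to
     // ints, shifted with the variable int shift, truncated back to the byte range with
     // vector_int_to_byte_mask, and then packed back down to words.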
1299 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1300   assert(opcode == Op_LShiftVB ||
1301          opcode == Op_RShiftVB ||
1302          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1303   bool sign = (opcode != Op_URShiftVB);
1304   assert(vector_len == 0, "required");
1305   vextendbd(sign, dst, src, 1);
1306   vpmovzxbd(vtmp, shift, 1);
1307   varshiftd(opcode, dst, dst, vtmp, 1);
1308   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1309   vextracti128_high(vtmp, dst);
1310   vpackusdw(dst, dst, vtmp, 0);
1311 }
1312 
1313 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
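     // AVX-512 variant: the bytes are widened to words at twice the vector length, shifted
     // with the variable word shift, masked back to the byte range, and packed down to
     // bytes; the trailing vpermq fixes the per-lane interleaving left by vpackuswb.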
1314 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1315   assert(opcode == Op_LShiftVB ||
1316          opcode == Op_RShiftVB ||
1317          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1318   bool sign = (opcode != Op_URShiftVB);
1319   int ext_vector_len = vector_len + 1;
1320   vextendbw(sign, dst, src, ext_vector_len);
1321   vpmovzxbw(vtmp, shift, ext_vector_len);
1322   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1323   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1324   if (vector_len == 0) {
1325     vextracti128_high(vtmp, dst);
1326     vpackuswb(dst, dst, vtmp, vector_len);
1327   } else {
1328     vextracti64x4_high(vtmp, dst);
1329     vpackuswb(dst, dst, vtmp, vector_len);
1330     vpermq(dst, dst, 0xD8, vector_len);
1331   }
1332 }
1333 
1334 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1335   switch(typ) {
1336     case T_BYTE:
1337       pinsrb(dst, val, idx);
1338       break;
1339     case T_SHORT:
1340       pinsrw(dst, val, idx);
1341       break;
1342     case T_INT:
1343       pinsrd(dst, val, idx);
1344       break;
1345     case T_LONG:
1346       pinsrq(dst, val, idx);
1347       break;
1348     default:
1349       assert(false,"Should not reach here.");
1350       break;
1351   }
1352 }
1353 
1354 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1355   switch(typ) {
1356     case T_BYTE:
1357       vpinsrb(dst, src, val, idx);
1358       break;
1359     case T_SHORT:
1360       vpinsrw(dst, src, val, idx);
1361       break;
1362     case T_INT:
1363       vpinsrd(dst, src, val, idx);
1364       break;
1365     case T_LONG:
1366       vpinsrq(dst, src, val, idx);
1367       break;
1368     default:
1369       assert(false,"Should not reach here.");
1370       break;
1371   }
1372 }
1373 
1374 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1375   switch(typ) {
1376     case T_INT:
1377       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1378       break;
1379     case T_FLOAT:
1380       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1381       break;
1382     case T_LONG:
1383       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1384       break;
1385     case T_DOUBLE:
1386       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1387       break;
1388     default:
1389       assert(false,"Should not reach here.");
1390       break;
1391   }
1392 }
1393 
1394 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1395   switch(typ) {
1396     case T_INT:
1397       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1398       break;
1399     case T_FLOAT:
1400       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1401       break;
1402     case T_LONG:
1403       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1404       break;
1405     case T_DOUBLE:
1406       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1407       break;
1408     default:
1409       assert(false,"Should not reach here.");
1410       break;
1411   }
1412 }
1413 
1414 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1415   switch(typ) {
1416     case T_INT:
1417       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1418       break;
1419     case T_FLOAT:
1420       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1421       break;
1422     case T_LONG:
1423       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1424       break;
1425     case T_DOUBLE:
1426       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1427       break;
1428     default:
1429       assert(false,"Should not reach here.");
1430       break;
1431   }
1432 }
1433 
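     // Expand a vector of booleans (one byte per element, expected to be 0 or 1) into a
     // full-width element mask: 0 - x turns each byte into 0x00 or 0xFF, and the following
     // sign extension widens that to all-zeros/all-ones per element of type elem_bt.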
1434 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1435   if (vlen_in_bytes <= 16) {
1436     pxor (dst, dst);
1437     psubb(dst, src);
1438     switch (elem_bt) {
1439       case T_BYTE:   /* nothing to do */ break;
1440       case T_SHORT:  pmovsxbw(dst, dst); break;
1441       case T_INT:    pmovsxbd(dst, dst); break;
1442       case T_FLOAT:  pmovsxbd(dst, dst); break;
1443       case T_LONG:   pmovsxbq(dst, dst); break;
1444       case T_DOUBLE: pmovsxbq(dst, dst); break;
1445 
1446       default: assert(false, "%s", type2name(elem_bt));
1447     }
1448   } else {
1449     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1450     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1451 
1452     vpxor (dst, dst, dst, vlen_enc);
1453     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1454 
1455     switch (elem_bt) {
1456       case T_BYTE:   /* nothing to do */            break;
1457       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1458       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1459       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1460       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1461       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1462 
1463       default: assert(false, "%s", type2name(elem_bt));
1464     }
1465   }
1466 }
1467 
1468 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1469   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1470   if (vlen_in_bytes == 4) {
1471     movdl(dst, addr);
1472   } else if (vlen_in_bytes == 8) {
1473     movq(dst, addr);
1474   } else if (vlen_in_bytes == 16) {
1475     movdqu(dst, addr, scratch);
1476   } else if (vlen_in_bytes == 32) {
1477     vmovdqu(dst, addr, scratch);
1478   } else {
1479     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1480     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1481   }
1482 }
1483 
1484 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
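     // The integer reductions repeatedly fold the upper half of the vector onto the lower
     // half (vextract*/pshufd plus a reduce_operation_* of matching width) until a single
     // element remains, which is then combined with the scalar input src1 and moved to dst.
     // The FP reductions (reduceF/reduceD) accumulate into dst instead of taking a scalar.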
1485 
1486 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1487   int vector_len = Assembler::AVX_128bit;
1488 
1489   switch (opcode) {
1490     case Op_AndReductionV:  pand(dst, src); break;
1491     case Op_OrReductionV:   por (dst, src); break;
1492     case Op_XorReductionV:  pxor(dst, src); break;
1493     case Op_MinReductionV:
1494       switch (typ) {
1495         case T_BYTE:        pminsb(dst, src); break;
1496         case T_SHORT:       pminsw(dst, src); break;
1497         case T_INT:         pminsd(dst, src); break;
1498         case T_LONG:        assert(UseAVX > 2, "required");
1499                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1500         default:            assert(false, "wrong type");
1501       }
1502       break;
1503     case Op_MaxReductionV:
1504       switch (typ) {
1505         case T_BYTE:        pmaxsb(dst, src); break;
1506         case T_SHORT:       pmaxsw(dst, src); break;
1507         case T_INT:         pmaxsd(dst, src); break;
1508         case T_LONG:        assert(UseAVX > 2, "required");
1509                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1510         default:            assert(false, "wrong type");
1511       }
1512       break;
1513     case Op_AddReductionVF: addss(dst, src); break;
1514     case Op_AddReductionVD: addsd(dst, src); break;
1515     case Op_AddReductionVI:
1516       switch (typ) {
1517         case T_BYTE:        paddb(dst, src); break;
1518         case T_SHORT:       paddw(dst, src); break;
1519         case T_INT:         paddd(dst, src); break;
1520         default:            assert(false, "wrong type");
1521       }
1522       break;
1523     case Op_AddReductionVL: paddq(dst, src); break;
1524     case Op_MulReductionVF: mulss(dst, src); break;
1525     case Op_MulReductionVD: mulsd(dst, src); break;
1526     case Op_MulReductionVI:
1527       switch (typ) {
1528         case T_SHORT:       pmullw(dst, src); break;
1529         case T_INT:         pmulld(dst, src); break;
1530         default:            assert(false, "wrong type");
1531       }
1532       break;
1533     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1534                             vpmullq(dst, dst, src, vector_len); break;
1535     default:                assert(false, "wrong opcode");
1536   }
1537 }
1538 
1539 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1540   int vector_len = Assembler::AVX_256bit;
1541 
1542   switch (opcode) {
1543     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1544     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1545     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1546     case Op_MinReductionV:
1547       switch (typ) {
1548         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1549         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1550         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1551         case T_LONG:        assert(UseAVX > 2, "required");
1552                             vpminsq(dst, src1, src2, vector_len); break;
1553         default:            assert(false, "wrong type");
1554       }
1555       break;
1556     case Op_MaxReductionV:
1557       switch (typ) {
1558         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1559         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1560         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1561         case T_LONG:        assert(UseAVX > 2, "required");
1562                             vpmaxsq(dst, src1, src2, vector_len); break;
1563         default:            assert(false, "wrong type");
1564       }
1565       break;
1566     case Op_AddReductionVI:
1567       switch (typ) {
1568         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1569         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1570         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1571         default:            assert(false, "wrong type");
1572       }
1573       break;
1574     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1575     case Op_MulReductionVI:
1576       switch (typ) {
1577         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1578         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1579         default:            assert(false, "wrong type");
1580       }
1581       break;
1582     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1583     default:                assert(false, "wrong opcode");
1584   }
1585 }
1586 
1587 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1588                                   XMMRegister dst, XMMRegister src,
1589                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1590   switch (opcode) {
1591     case Op_AddReductionVF:
1592     case Op_MulReductionVF:
1593       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1594       break;
1595 
1596     case Op_AddReductionVD:
1597     case Op_MulReductionVD:
1598       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1599       break;
1600 
1601     default: assert(false, "wrong opcode");
1602   }
1603 }
1604 
1605 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1606                              Register dst, Register src1, XMMRegister src2,
1607                              XMMRegister vtmp1, XMMRegister vtmp2) {
1608   switch (vlen) {
1609     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1610     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1611     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1612     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1613 
1614     default: assert(false, "wrong vector length");
1615   }
1616 }
1617 
1618 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1619                              Register dst, Register src1, XMMRegister src2,
1620                              XMMRegister vtmp1, XMMRegister vtmp2) {
1621   switch (vlen) {
1622     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1623     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1624     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1625     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1626 
1627     default: assert(false, "wrong vector length");
1628   }
1629 }
1630 
1631 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1632                              Register dst, Register src1, XMMRegister src2,
1633                              XMMRegister vtmp1, XMMRegister vtmp2) {
1634   switch (vlen) {
1635     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1636     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1637     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1638     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1639 
1640     default: assert(false, "wrong vector length");
1641   }
1642 }
1643 
1644 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1645                              Register dst, Register src1, XMMRegister src2,
1646                              XMMRegister vtmp1, XMMRegister vtmp2) {
1647   switch (vlen) {
1648     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1649     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1650     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1651     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1652 
1653     default: assert(false, "wrong vector length");
1654   }
1655 }
1656 
1657 #ifdef _LP64
1658 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1659                              Register dst, Register src1, XMMRegister src2,
1660                              XMMRegister vtmp1, XMMRegister vtmp2) {
1661   switch (vlen) {
1662     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1663     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1664     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1665 
1666     default: assert(false, "wrong vector length");
1667   }
1668 }
1669 #endif // _LP64
1670 
1671 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1672   switch (vlen) {
1673     case 2:
1674       assert(vtmp2 == xnoreg, "");
1675       reduce2F(opcode, dst, src, vtmp1);
1676       break;
1677     case 4:
1678       assert(vtmp2 == xnoreg, "");
1679       reduce4F(opcode, dst, src, vtmp1);
1680       break;
1681     case 8:
1682       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1683       break;
1684     case 16:
1685       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1686       break;
1687     default: assert(false, "wrong vector length");
1688   }
1689 }
1690 
1691 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1692   switch (vlen) {
1693     case 2:
1694       assert(vtmp2 == xnoreg, "");
1695       reduce2D(opcode, dst, src, vtmp1);
1696       break;
1697     case 4:
1698       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1699       break;
1700     case 8:
1701       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1702       break;
1703     default: assert(false, "wrong vector length");
1704   }
1705 }
1706 
1707 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1708   if (opcode == Op_AddReductionVI) {
1709     if (vtmp1 != src2) {
1710       movdqu(vtmp1, src2);
1711     }
1712     phaddd(vtmp1, vtmp1);
1713   } else {
1714     pshufd(vtmp1, src2, 0x1);
1715     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1716   }
1717   movdl(vtmp2, src1);
1718   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1719   movdl(dst, vtmp1);
1720 }
1721 
1722 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1723   if (opcode == Op_AddReductionVI) {
1724     if (vtmp1 != src2) {
1725       movdqu(vtmp1, src2);
1726     }
1727     phaddd(vtmp1, src2);
1728     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1729   } else {
1730     pshufd(vtmp2, src2, 0xE);
1731     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1732     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1733   }
1734 }
1735 
1736 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1737   if (opcode == Op_AddReductionVI) {
1738     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1739     vextracti128_high(vtmp2, vtmp1);
1740     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1741     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1742   } else {
1743     vextracti128_high(vtmp1, src2);
1744     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1745     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1746   }
1747 }
1748 
1749 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1750   vextracti64x4_high(vtmp2, src2);
1751   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1752   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1753 }
1754 
1755 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1756   pshufd(vtmp2, src2, 0x1);
1757   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1758   movdqu(vtmp1, vtmp2);
1759   psrldq(vtmp1, 2);
1760   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1761   movdqu(vtmp2, vtmp1);
1762   psrldq(vtmp2, 1);
1763   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1764   movdl(vtmp2, src1);
1765   pmovsxbd(vtmp1, vtmp1);
1766   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1767   pextrb(dst, vtmp1, 0x0);
1768   movsbl(dst, dst);
1769 }
1770 
1771 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1772   pshufd(vtmp1, src2, 0xE);
1773   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1774   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1775 }
1776 
1777 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1778   vextracti128_high(vtmp2, src2);
1779   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1780   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1781 }
1782 
1783 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1784   vextracti64x4_high(vtmp1, src2);
1785   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1786   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1787 }
1788 
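     // Multiply reductions over bytes: there is no packed byte multiply on x86, so the
     // bytes are sign-extended to words and reduced with the 16-bit (reduce*S) helpers.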
1789 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1790   pmovsxbw(vtmp2, src2);
1791   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1792 }
1793 
1794 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1795   if (UseAVX > 1) {
1796     int vector_len = Assembler::AVX_256bit;
1797     vpmovsxbw(vtmp1, src2, vector_len);
1798     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1799   } else {
1800     pmovsxbw(vtmp2, src2);
1801     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1802     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes down to the low half
1803     pmovsxbw(vtmp2, vtmp2);     // and sign-extend them before the second reduce8S
1804     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1805   }
1806 }
1807 
1808 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1809   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1810     int vector_len = Assembler::AVX_512bit;
1811     vpmovsxbw(vtmp1, src2, vector_len);
1812     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1813   } else {
1814     assert(UseAVX >= 2,"Should not reach here.");
1815     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1816     vextracti128_high(vtmp2, src2);
1817     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1818   }
1819 }
1820 
1821 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1822   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1823   vextracti64x4_high(vtmp2, src2);
1824   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1825 }
1826 
1827 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1828   if (opcode == Op_AddReductionVI) {
1829     if (vtmp1 != src2) {
1830       movdqu(vtmp1, src2);
1831     }
1832     phaddw(vtmp1, vtmp1);
1833     phaddw(vtmp1, vtmp1);
1834   } else {
1835     pshufd(vtmp2, src2, 0x1);
1836     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1837     movdqu(vtmp1, vtmp2);
1838     psrldq(vtmp1, 2);
1839     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1840   }
1841   movdl(vtmp2, src1);
1842   pmovsxwd(vtmp1, vtmp1);
1843   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1844   pextrw(dst, vtmp1, 0x0);
1845   movswl(dst, dst);
1846 }
1847 
1848 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1849   if (opcode == Op_AddReductionVI) {
1850     if (vtmp1 != src2) {
1851       movdqu(vtmp1, src2);
1852     }
1853     phaddw(vtmp1, src2);
1854   } else {
1855     pshufd(vtmp1, src2, 0xE);
1856     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1857   }
1858   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1859 }
1860 
1861 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1862   if (opcode == Op_AddReductionVI) {
1863     int vector_len = Assembler::AVX_256bit;
1864     vphaddw(vtmp2, src2, src2, vector_len);
1865     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1866   } else {
1867     vextracti128_high(vtmp2, src2);
1868     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1869   }
1870   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1871 }
1872 
1873 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1874   int vector_len = Assembler::AVX_256bit;
1875   vextracti64x4_high(vtmp1, src2);
1876   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1877   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1878 }
1879 
1880 #ifdef _LP64
1881 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1882   pshufd(vtmp2, src2, 0xE);
1883   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1884   movdq(vtmp1, src1);
1885   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1886   movdq(dst, vtmp1);
1887 }
1888 
1889 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1890   vextracti128_high(vtmp1, src2);
1891   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1892   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1893 }
1894 
1895 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1896   vextracti64x4_high(vtmp2, src2);
1897   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1898   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1899 }
1900 
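     // Build a k-register mask with the low 'len' bits set: BZHI clears every bit of -1 at
     // position >= len, and the result is moved into the opmask register.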
1901 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1902   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1903   mov64(temp, -1L);
1904   bzhiq(temp, temp, len);
1905   kmovql(dst, temp);
1906 }
1907 #endif // _LP64
1908 
1909 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1910   reduce_operation_128(T_FLOAT, opcode, dst, src);
1911   pshufd(vtmp, src, 0x1);
1912   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1913 }
1914 
1915 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1916   reduce2F(opcode, dst, src, vtmp);
1917   pshufd(vtmp, src, 0x2);
1918   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1919   pshufd(vtmp, src, 0x3);
1920   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1921 }
1922 
1923 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1924   reduce4F(opcode, dst, src, vtmp2);
1925   vextractf128_high(vtmp2, src);
1926   reduce4F(opcode, dst, vtmp2, vtmp1);
1927 }
1928 
1929 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1930   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1931   vextracti64x4_high(vtmp1, src);
1932   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1933 }
1934 
1935 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1936   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1937   pshufd(vtmp, src, 0xE);
1938   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1939 }
1940 
1941 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1942   reduce2D(opcode, dst, src, vtmp2);
1943   vextractf128_high(vtmp2, src);
1944   reduce2D(opcode, dst, vtmp2, vtmp1);
1945 }
1946 
1947 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1948   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1949   vextracti64x4_high(vtmp1, src);
1950   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1951 }
1952 
1953 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1954   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1955 }
1956 
1957 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1958   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1959 }
1960 
1961 
1962 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1963                                           XMMRegister dst, XMMRegister src,
1964                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1965                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1966   int permconst[] = {1, 14};
1967   XMMRegister wsrc = src;
1968   XMMRegister wdst = xmm_0;
1969   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1970 
1971   int vlen_enc = Assembler::AVX_128bit;
1972   if (vlen == 16) {
1973     vlen_enc = Assembler::AVX_256bit;
1974   }
1975 
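       // Fold the upper half of the remaining elements onto the lower half on each
       // iteration: i == 3 brings down the upper 256 bits, i == 2 the upper 128 bits, and
       // i == 1/0 permute within a 128-bit lane (permconst selects the upper pair or the
       // adjacent element), combining with vminmax_fp every time.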
1976   for (int i = log2(vlen) - 1; i >=0; i--) {
1977     if (i == 0 && !is_dst_valid) {
1978       wdst = dst;
1979     }
1980     if (i == 3) {
1981       vextracti64x4_high(wtmp, wsrc);
1982     } else if (i == 2) {
1983       vextracti128_high(wtmp, wsrc);
1984     } else { // i = [0,1]
1985       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1986     }
1987     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1988     wsrc = wdst;
1989     vlen_enc = Assembler::AVX_128bit;
1990   }
1991   if (is_dst_valid) {
1992     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
1997                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1998                                         XMMRegister xmm_0, XMMRegister xmm_1) {
1999   XMMRegister wsrc = src;
2000   XMMRegister wdst = xmm_0;
2001   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2002   int vlen_enc = Assembler::AVX_128bit;
2003   if (vlen == 8) {
2004     vlen_enc = Assembler::AVX_256bit;
2005   }
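       // Same folding strategy as reduceFloatMinMax, but with 64-bit elements: i == 2 folds
       // the upper 256 bits, i == 1 the upper 128 bits, and i == 0 swaps the two doubles
       // within a lane via vpermilpd.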
2006   for (int i = log2(vlen) - 1; i >=0; i--) {
2007     if (i == 0 && !is_dst_valid) {
2008       wdst = dst;
2009     }
2010     if (i == 1) {
2011       vextracti128_high(wtmp, wsrc);
2012     } else if (i == 2) {
2013       vextracti64x4_high(wtmp, wsrc);
2014     } else {
2015       assert(i == 0, "%d", i);
2016       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2017     }
2018     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2019     wsrc = wdst;
2020     vlen_enc = Assembler::AVX_128bit;
2021   }
2022   if (is_dst_valid) {
2023     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2024   }
2025 }
2026 
2027 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2028   switch (bt) {
2029     case T_BYTE:  pextrb(dst, src, idx); break;
2030     case T_SHORT: pextrw(dst, src, idx); break;
2031     case T_INT:   pextrd(dst, src, idx); break;
2032     case T_LONG:  pextrq(dst, src, idx); break;
2033 
2034     default:
2035       assert(false,"Should not reach here.");
2036       break;
2037   }
2038 }
2039 
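     // Return the XMM register holding the 128-bit lane that contains 'elemindex': lane 0
     // is src itself, lane 1 is extracted into dst with vextractf128, and lanes 2-3 use the
     // AVX-512 vextractf32x4.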
2040 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2041   int esize =  type2aelembytes(typ);
2042   int elem_per_lane = 16/esize;
2043   int lane = elemindex / elem_per_lane;
2044   int eindex = elemindex % elem_per_lane;
2045 
2046   if (lane >= 2) {
2047     assert(UseAVX > 2, "required");
2048     vextractf32x4(dst, src, lane & 3);
2049     return dst;
2050   } else if (lane > 0) {
2051     assert(UseAVX > 0, "required");
2052     vextractf128(dst, src, lane);
2053     return dst;
2054   } else {
2055     return src;
2056   }
2057 }
2058 
2059 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2060   int esize =  type2aelembytes(typ);
2061   int elem_per_lane = 16/esize;
2062   int eindex = elemindex % elem_per_lane;
2063   assert(is_integral_type(typ),"required");
2064 
2065   if (eindex == 0) {
2066     if (typ == T_LONG) {
2067       movq(dst, src);
2068     } else {
2069       movdl(dst, src);
2070       if (typ == T_BYTE)
2071         movsbl(dst, dst);
2072       else if (typ == T_SHORT)
2073         movswl(dst, dst);
2074     }
2075   } else {
2076     extract(typ, dst, src, eindex);
2077   }
2078 }
2079 
2080 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2081   int esize =  type2aelembytes(typ);
2082   int elem_per_lane = 16/esize;
2083   int eindex = elemindex % elem_per_lane;
2084   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2085 
2086   if (eindex == 0) {
2087     movq(dst, src);
2088   } else {
2089     if (typ == T_FLOAT) {
2090       if (UseAVX == 0) {
2091         movdqu(dst, src);
2092         pshufps(dst, dst, eindex);
2093       } else {
2094         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2095       }
2096     } else {
2097       if (UseAVX == 0) {
2098         movdqu(dst, src);
2099         psrldq(dst, eindex*esize);
2100       } else {
2101         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2102       }
2103       movq(dst, dst);
2104     }
2105   }
2106   // Zero upper bits
2107   if (typ == T_FLOAT) {
2108     if (UseAVX == 0) {
2109       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2110       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2111       pand(dst, vtmp);
2112     } else {
2113       assert((tmp != noreg), "required.");
2114       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2115     }
2116   }
2117 }
2118 
2119 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2120   switch(typ) {
2121     case T_BYTE:
2122     case T_BOOLEAN:
2123       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2124       break;
2125     case T_SHORT:
2126     case T_CHAR:
2127       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2128       break;
2129     case T_INT:
2130     case T_FLOAT:
2131       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2132       break;
2133     case T_LONG:
2134     case T_DOUBLE:
2135       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2136       break;
2137     default:
2138       assert(false,"Should not reach here.");
2139       break;
2140   }
2141 }
2142 
2143 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2144   switch(typ) {
2145     case T_BOOLEAN:
2146     case T_BYTE:
2147       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2148       break;
2149     case T_CHAR:
2150     case T_SHORT:
2151       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2152       break;
2153     case T_INT:
2154     case T_FLOAT:
2155       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2156       break;
2157     case T_LONG:
2158     case T_DOUBLE:
2159       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2160       break;
2161     default:
2162       assert(false,"Should not reach here.");
2163       break;
2164   }
2165 }
2166 
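     // Unsigned vector compare without an unsigned compare instruction: the elements are
     // zero-extended to the next wider type, compared (signed) with vpcmpCCW at that width,
     // and the result is narrowed back with pack/permute to the original element size.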
2167 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2168                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2169   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2170   switch (typ) {
2171   case T_BYTE:
2172     vpmovzxbw(vtmp1, src1, vlen_enc);
2173     vpmovzxbw(vtmp2, src2, vlen_enc);
2174     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2175     vpacksswb(dst, dst, dst, vlen_enc);
2176     break;
2177   case T_SHORT:
2178     vpmovzxwd(vtmp1, src1, vlen_enc);
2179     vpmovzxwd(vtmp2, src2, vlen_enc);
2180     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2181     vpackssdw(dst, dst, dst, vlen_enc);
2182     break;
2183   case T_INT:
2184     vpmovzxdq(vtmp1, src1, vlen_enc);
2185     vpmovzxdq(vtmp2, src2, vlen_enc);
2186     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2187     vpermilps(dst, dst, 8, vlen_enc);
2188     break;
2189   default:
2190     assert(false, "Should not reach here");
2191   }
2192   if (vlen_in_bytes == 16) {
2193     vpermpd(dst, dst, 0x8, vlen_enc);
2194   }
2195 }
2196 
2197 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2198                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2199   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2200   switch (typ) {
2201   case T_BYTE:
2202     vpmovzxbw(vtmp1, src1, vlen_enc);
2203     vpmovzxbw(vtmp2, src2, vlen_enc);
2204     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2205     vextracti128(vtmp1, src1, 1);
2206     vextracti128(vtmp2, src2, 1);
2207     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2208     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2209     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2210     vpacksswb(dst, dst, vtmp3, vlen_enc);
2211     vpermpd(dst, dst, 0xd8, vlen_enc);
2212     break;
2213   case T_SHORT:
2214     vpmovzxwd(vtmp1, src1, vlen_enc);
2215     vpmovzxwd(vtmp2, src2, vlen_enc);
2216     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2217     vextracti128(vtmp1, src1, 1);
2218     vextracti128(vtmp2, src2, 1);
2219     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2220     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2221     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2222     vpackssdw(dst, dst, vtmp3, vlen_enc);
2223     vpermpd(dst, dst, 0xd8, vlen_enc);
2224     break;
2225   case T_INT:
2226     vpmovzxdq(vtmp1, src1, vlen_enc);
2227     vpmovzxdq(vtmp2, src2, vlen_enc);
2228     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2229     vpshufd(dst, dst, 8, vlen_enc);
2230     vpermq(dst, dst, 8, vlen_enc);
2231     vextracti128(vtmp1, src1, 1);
2232     vextracti128(vtmp2, src2, 1);
2233     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2234     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2235     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2236     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2237     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2238     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2239     break;
2240   default:
2241     assert(false, "Should not reach here");
2242   }
2243 }
2244 
2245 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2246   switch(typ) {
2247     case T_BYTE:
2248       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2249       break;
2250     case T_SHORT:
2251       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2252       break;
2253     case T_INT:
2254     case T_FLOAT:
2255       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2256       break;
2257     case T_LONG:
2258     case T_DOUBLE:
2259       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2260       break;
2261     default:
2262       assert(false,"Should not reach here.");
2263       break;
2264   }
2265 }
2266 
2267 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2268                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2269   switch(vlen) {
2270     case 4:
2271       assert(vtmp1 != xnoreg, "required.");
2272       // Broadcast lower 32 bits to 128 bits before ptest
2273       pshufd(vtmp1, src1, 0x0);
2274       if (bt == BoolTest::overflow) {
2275         assert(vtmp2 != xnoreg, "required.");
2276         pshufd(vtmp2, src2, 0x0);
2277       } else {
2278         assert(vtmp2 == xnoreg, "required.");
2279         vtmp2 = src2;
2280       }
2281       ptest(vtmp1, vtmp2);
2282      break;
2283     case 8:
2284       assert(vtmp1 != xnoreg, "required.");
2285       // Broadcast lower 64 bits to 128 bits before ptest
2286       pshufd(vtmp1, src1, 0x4);
2287       if (bt == BoolTest::overflow) {
2288         assert(vtmp2 != xnoreg, "required.");
2289         pshufd(vtmp2, src2, 0x4);
2290       } else {
2291         assert(vtmp2 == xnoreg, "required.");
2292         vtmp2 = src2;
2293       }
2294       ptest(vtmp1, vtmp2);
2295      break;
2296     case 16:
2297       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2298       ptest(src1, src2);
2299       break;
2300     case 32:
2301       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2302       vptest(src1, src2, Assembler::AVX_256bit);
2303       break;
2304     case 64:
2305       {
2306         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2307         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2308         if (bt == BoolTest::ne) {
2309           ktestql(mask, mask);
2310         } else {
2311           assert(bt == BoolTest::overflow, "required");
2312           kortestql(mask, mask);
2313         }
2314       }
2315       break;
2316     default:
2317       assert(false,"Should not reach here.");
2318       break;
2319   }
2320 }
2321 
2322 //-------------------------------------------------------------------------------------------
2323 
2324 // IndexOf for constant substrings with size >= 8 chars
2325 // which don't need to be loaded through the stack.
2326 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2327                                          Register cnt1, Register cnt2,
2328                                          int int_cnt2,  Register result,
2329                                          XMMRegister vec, Register tmp,
2330                                          int ae) {
2331   ShortBranchVerifier sbv(this);
2332   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2333   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2334 
2335   // This method uses the pcmpestri instruction with bound registers
2336   //   inputs:
2337   //     xmm - substring
2338   //     rax - substring length (elements count)
2339   //     mem - scanned string
2340   //     rdx - string length (elements count)
2341   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2342   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2343   //   outputs:
2344   //     rcx - matched index in string
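       //   flags (equal-ordered mode):
       //     CF - set when a match candidate was found somewhere in the 16-byte chunk
       //     OF - set when the match starts at element 0, i.e. the whole vector matched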
2345   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2346   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2347   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2348   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2349   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2350 
2351   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2352         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2353         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2354 
2355   // Note, inline_string_indexOf() generates checks:
2356   // if (substr.count > string.count) return -1;
2357   // if (substr.count == 0) return 0;
2358   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2359 
2360   // Load substring.
2361   if (ae == StrIntrinsicNode::UL) {
2362     pmovzxbw(vec, Address(str2, 0));
2363   } else {
2364     movdqu(vec, Address(str2, 0));
2365   }
2366   movl(cnt2, int_cnt2);
2367   movptr(result, str1); // string addr
2368 
2369   if (int_cnt2 > stride) {
2370     jmpb(SCAN_TO_SUBSTR);
2371 
2372     // Reload substr for rescan; this code
2373     // is executed only for large substrings (> 8 chars)
2374     bind(RELOAD_SUBSTR);
2375     if (ae == StrIntrinsicNode::UL) {
2376       pmovzxbw(vec, Address(str2, 0));
2377     } else {
2378       movdqu(vec, Address(str2, 0));
2379     }
2380     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2381 
2382     bind(RELOAD_STR);
2383     // We came here after the beginning of the substring was
2384     // matched but the rest of it was not, so we need to search
2385     // again. Start from the next element after the previous match.
2386 
2387     // cnt2 is the number of remaining substring elements and
2388     // cnt1 is the number of remaining string elements when the compare failed.
2389     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2390     subl(cnt1, cnt2);
2391     addl(cnt1, int_cnt2);
2392     movl(cnt2, int_cnt2); // Now restore cnt2
2393 
2394     decrementl(cnt1);     // Shift to next element
2395     cmpl(cnt1, cnt2);
2396     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2397 
2398     addptr(result, (1<<scale1));
2399 
2400   } // (int_cnt2 > 8)
2401 
2402   // Scan string for start of substr in 16-byte vectors
2403   bind(SCAN_TO_SUBSTR);
2404   pcmpestri(vec, Address(result, 0), mode);
2405   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2406   subl(cnt1, stride);
2407   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2408   cmpl(cnt1, cnt2);
2409   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2410   addptr(result, 16);
2411   jmpb(SCAN_TO_SUBSTR);
2412 
2413   // Found a potential substr
2414   bind(FOUND_CANDIDATE);
2415   // Matched whole vector if first element matched (tmp(rcx) == 0).
2416   if (int_cnt2 == stride) {
2417     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2418   } else { // int_cnt2 > 8
2419     jccb(Assembler::overflow, FOUND_SUBSTR);
2420   }
2421   // After pcmpestri tmp(rcx) contains matched element index
2422   // Compute start addr of substr
2423   lea(result, Address(result, tmp, scale1));
2424 
2425   // Make sure string is still long enough
2426   subl(cnt1, tmp);
2427   cmpl(cnt1, cnt2);
2428   if (int_cnt2 == stride) {
2429     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2430   } else { // int_cnt2 > 8
2431     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2432   }
2433   // Fewer elements left than the substring.
2434 
2435   bind(RET_NOT_FOUND);
2436   movl(result, -1);
2437   jmp(EXIT);
2438 
2439   if (int_cnt2 > stride) {
2440     // This code is optimized for the case when whole substring
2441     // is matched if its head is matched.
2442     bind(MATCH_SUBSTR_HEAD);
2443     pcmpestri(vec, Address(result, 0), mode);
2444     // Reload only the string if it does not match
2445     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2446 
2447     Label CONT_SCAN_SUBSTR;
2448     // Compare the rest of substring (> 8 chars).
2449     bind(FOUND_SUBSTR);
2450     // First 8 chars are already matched.
2451     negptr(cnt2);
2452     addptr(cnt2, stride);
2453 
2454     bind(SCAN_SUBSTR);
2455     subl(cnt1, stride);
2456     cmpl(cnt2, -stride); // Do not read beyond substring
2457     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2458     // Back-up strings to avoid reading beyond substring:
2459     // cnt1 = cnt1 - cnt2 + 8
2460     addl(cnt1, cnt2); // cnt2 is negative
2461     addl(cnt1, stride);
2462     movl(cnt2, stride); negptr(cnt2);
2463     bind(CONT_SCAN_SUBSTR);
2464     if (int_cnt2 < (int)G) {
2465       int tail_off1 = int_cnt2<<scale1;
2466       int tail_off2 = int_cnt2<<scale2;
2467       if (ae == StrIntrinsicNode::UL) {
2468         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2469       } else {
2470         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2471       }
2472       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2473     } else {
2474       // calculate index in register to avoid integer overflow (int_cnt2*2)
2475       movl(tmp, int_cnt2);
2476       addptr(tmp, cnt2);
2477       if (ae == StrIntrinsicNode::UL) {
2478         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2479       } else {
2480         movdqu(vec, Address(str2, tmp, scale2, 0));
2481       }
2482       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2483     }
2484     // Need to reload the string pointers if the whole vector did not match
2485     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2486     addptr(cnt2, stride);
2487     jcc(Assembler::negative, SCAN_SUBSTR);
2488     // Fall through if found full substring
2489 
2490   } // (int_cnt2 > 8)
2491 
2492   bind(RET_FOUND);
2493   // Found result if we matched full small substring.
2494   // Compute substr offset
2495   subptr(result, str1);
2496   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2497     shrl(result, 1); // index
2498   }
2499   bind(EXIT);
2500 
2501 } // string_indexofC8
2502 
2503 // Small strings are loaded through the stack if they cross a page boundary.
2504 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2505                                        Register cnt1, Register cnt2,
2506                                        int int_cnt2,  Register result,
2507                                        XMMRegister vec, Register tmp,
2508                                        int ae) {
2509   ShortBranchVerifier sbv(this);
2510   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2511   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2512 
2513   //
2514   // int_cnt2 is the length of a small (< 8 chars) constant substring,
2515   // or (-1) for a non-constant substring, in which case its length
2516   // is in the cnt2 register.
2517   //
2518   // Note, inline_string_indexOf() generates checks:
2519   // if (substr.count > string.count) return -1;
2520   // if (substr.count == 0) return 0;
2521   //
2522   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2523   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2524   // This method uses the pcmpestri instruction with bound registers
2525   //   inputs:
2526   //     xmm - substring
2527   //     rax - substring length (elements count)
2528   //     mem - scanned string
2529   //     rdx - string length (elements count)
2530   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2531   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2532   //   outputs:
2533   //     rcx - matched index in string
2534   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2535   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2536   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2537   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2538 
2539   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2540         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2541         FOUND_CANDIDATE;
2542 
2543   { //========================================================
2544     // We don't know where these strings are located
2545     // and we can't read beyond them. Load them through the stack.
2546     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2547 
2548     movptr(tmp, rsp); // save old SP
2549 
2550     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2551       if (int_cnt2 == (1>>scale2)) { // One byte
2552         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2553         load_unsigned_byte(result, Address(str2, 0));
2554         movdl(vec, result); // move 32 bits
2555       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2556         // Not enough header space in 32-bit VM: 12+3 = 15.
2557         movl(result, Address(str2, -1));
2558         shrl(result, 8);
2559         movdl(vec, result); // move 32 bits
2560       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2561         load_unsigned_short(result, Address(str2, 0));
2562         movdl(vec, result); // move 32 bits
2563       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2564         movdl(vec, Address(str2, 0)); // move 32 bits
2565       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2566         movq(vec, Address(str2, 0));  // move 64 bits
2567       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2568         // Array header size is 12 bytes in 32-bit VM
2569         // + 6 bytes for 3 chars == 18 bytes,
2570         // enough space to load vec and shift.
2571         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2572         if (ae == StrIntrinsicNode::UL) {
2573           int tail_off = int_cnt2-8;
2574           pmovzxbw(vec, Address(str2, tail_off));
2575           psrldq(vec, -2*tail_off);
2576         }
2577         else {
2578           int tail_off = int_cnt2*(1<<scale2);
2579           movdqu(vec, Address(str2, tail_off-16));
2580           psrldq(vec, 16-tail_off);
2581         }
2582       }
2583     } else { // not constant substring
2584       cmpl(cnt2, stride);
2585       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2586 
2587       // We can read beyond the string if str+16 does not cross a page boundary
2588       // since heaps are aligned and mapped by pages.
2589       assert(os::vm_page_size() < (int)G, "default page should be small");
2590       movl(result, str2); // We need only low 32 bits
2591       andl(result, (os::vm_page_size()-1));
2592       cmpl(result, (os::vm_page_size()-16));
2593       jccb(Assembler::belowEqual, CHECK_STR);
2594 
2595       // Move small strings to the stack to allow loading 16 bytes into vec.
2596       subptr(rsp, 16);
2597       int stk_offset = wordSize-(1<<scale2);
2598       push(cnt2);
2599 
2600       bind(COPY_SUBSTR);
2601       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2602         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2603         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2604       } else if (ae == StrIntrinsicNode::UU) {
2605         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2606         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2607       }
2608       decrement(cnt2);
2609       jccb(Assembler::notZero, COPY_SUBSTR);
2610 
2611       pop(cnt2);
2612       movptr(str2, rsp);  // New substring address
2613     } // non constant
2614 
2615     bind(CHECK_STR);
2616     cmpl(cnt1, stride);
2617     jccb(Assembler::aboveEqual, BIG_STRINGS);
2618 
2619     // Check cross page boundary.
2620     movl(result, str1); // We need only low 32 bits
2621     andl(result, (os::vm_page_size()-1));
2622     cmpl(result, (os::vm_page_size()-16));
2623     jccb(Assembler::belowEqual, BIG_STRINGS);
2624 
2625     subptr(rsp, 16);
2626     int stk_offset = -(1<<scale1);
2627     if (int_cnt2 < 0) { // not constant
2628       push(cnt2);
2629       stk_offset += wordSize;
2630     }
2631     movl(cnt2, cnt1);
2632 
2633     bind(COPY_STR);
2634     if (ae == StrIntrinsicNode::LL) {
2635       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2636       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2637     } else {
2638       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2639       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2640     }
2641     decrement(cnt2);
2642     jccb(Assembler::notZero, COPY_STR);
2643 
2644     if (int_cnt2 < 0) { // not constant
2645       pop(cnt2);
2646     }
2647     movptr(str1, rsp);  // New string address
2648 
2649     bind(BIG_STRINGS);
2650     // Load substring.
2651     if (int_cnt2 < 0) { // -1
2652       if (ae == StrIntrinsicNode::UL) {
2653         pmovzxbw(vec, Address(str2, 0));
2654       } else {
2655         movdqu(vec, Address(str2, 0));
2656       }
2657       push(cnt2);       // substr count
2658       push(str2);       // substr addr
2659       push(str1);       // string addr
2660     } else {
2661       // Small (< 8 chars) constant substrings are loaded already.
2662       movl(cnt2, int_cnt2);
2663     }
2664     push(tmp);  // original SP
2665 
2666   } // Finished loading
2667 
2668   //========================================================
2669   // Start search
2670   //
2671 
2672   movptr(result, str1); // string addr
2673 
2674   if (int_cnt2  < 0) {  // Only for non constant substring
2675     jmpb(SCAN_TO_SUBSTR);
2676 
2677     // SP saved at sp+0
2678     // String saved at sp+1*wordSize
2679     // Substr saved at sp+2*wordSize
2680     // Substr count saved at sp+3*wordSize
2681 
2682     // Reload substr for rescan, this code
2683     // is executed only for large substrings (> 8 chars)
2684     bind(RELOAD_SUBSTR);
2685     movptr(str2, Address(rsp, 2*wordSize));
2686     movl(cnt2, Address(rsp, 3*wordSize));
2687     if (ae == StrIntrinsicNode::UL) {
2688       pmovzxbw(vec, Address(str2, 0));
2689     } else {
2690       movdqu(vec, Address(str2, 0));
2691     }
2692     // We came here after the beginning of the substring was
2693     // matched but the rest of it was not, so we need to search
2694     // again. Start from the next element after the previous match.
2695     subptr(str1, result); // Restore counter
2696     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2697       shrl(str1, 1);
2698     }
2699     addl(cnt1, str1);
2700     decrementl(cnt1);   // Shift to next element
2701     cmpl(cnt1, cnt2);
2702     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2703 
2704     addptr(result, (1<<scale1));
2705   } // non constant
2706 
2707   // Scan string for start of substr in 16-byte vectors
2708   bind(SCAN_TO_SUBSTR);
2709   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2710   pcmpestri(vec, Address(result, 0), mode);
2711   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2712   subl(cnt1, stride);
2713   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2714   cmpl(cnt1, cnt2);
2715   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2716   addptr(result, 16);
2717 
2718   bind(ADJUST_STR);
2719   cmpl(cnt1, stride); // Do not read beyond string
2720   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2721   // Back-up string to avoid reading beyond string.
2722   lea(result, Address(result, cnt1, scale1, -16));
2723   movl(cnt1, stride);
2724   jmpb(SCAN_TO_SUBSTR);
2725 
2726   // Found a potential substr
2727   bind(FOUND_CANDIDATE);
2728   // After pcmpestri tmp(rcx) contains matched element index
2729 
2730   // Make sure string is still long enough
2731   subl(cnt1, tmp);
2732   cmpl(cnt1, cnt2);
2733   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2734   // Left less than substring.
2735 
2736   bind(RET_NOT_FOUND);
2737   movl(result, -1);
2738   jmp(CLEANUP);
2739 
2740   bind(FOUND_SUBSTR);
2741   // Compute start addr of substr
2742   lea(result, Address(result, tmp, scale1));
2743   if (int_cnt2 > 0) { // Constant substring
2744     // Repeat search for small substring (< 8 chars)
2745     // from new point without reloading substring.
2746     // Have to check that we don't read beyond string.
2747     cmpl(tmp, stride-int_cnt2);
2748     jccb(Assembler::greater, ADJUST_STR);
2749     // Fall through if matched whole substring.
2750   } else { // non constant
2751     assert(int_cnt2 == -1, "should be != 0");
2752 
2753     addl(tmp, cnt2);
2754     // Found result if we matched whole substring.
2755     cmpl(tmp, stride);
2756     jcc(Assembler::lessEqual, RET_FOUND);
2757 
2758     // Repeat search for small substring (<= 8 chars)
2759     // from new point 'str1' without reloading substring.
2760     cmpl(cnt2, stride);
2761     // Have to check that we don't read beyond string.
2762     jccb(Assembler::lessEqual, ADJUST_STR);
2763 
2764     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2765     // Compare the rest of substring (> 8 chars).
2766     movptr(str1, result);
2767 
2768     cmpl(tmp, cnt2);
2769     // First 8 chars are already matched.
2770     jccb(Assembler::equal, CHECK_NEXT);
2771 
2772     bind(SCAN_SUBSTR);
2773     pcmpestri(vec, Address(str1, 0), mode);
2774     // Need to reload string pointers if the whole vector did not match
2775     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2776 
2777     bind(CHECK_NEXT);
2778     subl(cnt2, stride);
2779     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2780     addptr(str1, 16);
2781     if (ae == StrIntrinsicNode::UL) {
2782       addptr(str2, 8);
2783     } else {
2784       addptr(str2, 16);
2785     }
2786     subl(cnt1, stride);
2787     cmpl(cnt2, stride); // Do not read beyond substring
2788     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2789     // Back-up strings to avoid reading beyond substring.
2790 
2791     if (ae == StrIntrinsicNode::UL) {
2792       lea(str2, Address(str2, cnt2, scale2, -8));
2793       lea(str1, Address(str1, cnt2, scale1, -16));
2794     } else {
2795       lea(str2, Address(str2, cnt2, scale2, -16));
2796       lea(str1, Address(str1, cnt2, scale1, -16));
2797     }
2798     subl(cnt1, cnt2);
2799     movl(cnt2, stride);
2800     addl(cnt1, stride);
2801     bind(CONT_SCAN_SUBSTR);
2802     if (ae == StrIntrinsicNode::UL) {
2803       pmovzxbw(vec, Address(str2, 0));
2804     } else {
2805       movdqu(vec, Address(str2, 0));
2806     }
2807     jmp(SCAN_SUBSTR);
2808 
2809     bind(RET_FOUND_LONG);
2810     movptr(str1, Address(rsp, wordSize));
2811   } // non constant
2812 
2813   bind(RET_FOUND);
2814   // Compute substr offset
2815   subptr(result, str1);
2816   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2817     shrl(result, 1); // index
2818   }
2819   bind(CLEANUP);
2820   pop(rsp); // restore SP
2821 
2822 } // string_indexof
2823 
2824 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2825                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2826   ShortBranchVerifier sbv(this);
2827   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2828 
2829   int stride = 8;
2830 
2831   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2832         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2833         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2834         FOUND_SEQ_CHAR, DONE_LABEL;
2835 
2836   movptr(result, str1);
2837   if (UseAVX >= 2) {
2838     cmpl(cnt1, stride);
2839     jcc(Assembler::less, SCAN_TO_CHAR);
2840     cmpl(cnt1, 2*stride);
2841     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2842     movdl(vec1, ch);
2843     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2844     vpxor(vec2, vec2);
2845     movl(tmp, cnt1);
2846     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2847     andl(cnt1,0x0000000F);  //tail count (in chars)
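    // For example, cnt1 == 37 gives tmp == 32 (two 16-char iterations) and a
    // 5-char tail handled by the scalar loop.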
2848 
2849     bind(SCAN_TO_16_CHAR_LOOP);
2850     vmovdqu(vec3, Address(result, 0));
2851     vpcmpeqw(vec3, vec3, vec1, Assembler::AVX_256bit);
2852     vptest(vec2, vec3);
2853     jcc(Assembler::carryClear, FOUND_CHAR);
2854     addptr(result, 32);
2855     subl(tmp, 2*stride);
2856     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2857     jmp(SCAN_TO_8_CHAR);
2858     bind(SCAN_TO_8_CHAR_INIT);
2859     movdl(vec1, ch);
2860     pshuflw(vec1, vec1, 0x00);
2861     pshufd(vec1, vec1, 0);
2862     pxor(vec2, vec2);
2863   }
2864   bind(SCAN_TO_8_CHAR);
2865   cmpl(cnt1, stride);
2866   jcc(Assembler::less, SCAN_TO_CHAR);
2867   if (UseAVX < 2) {
2868     movdl(vec1, ch);
2869     pshuflw(vec1, vec1, 0x00);
2870     pshufd(vec1, vec1, 0);
2871     pxor(vec2, vec2);
2872   }
2873   movl(tmp, cnt1);
2874   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2875   andl(cnt1,0x00000007);  //tail count (in chars)
2876 
2877   bind(SCAN_TO_8_CHAR_LOOP);
2878   movdqu(vec3, Address(result, 0));
2879   pcmpeqw(vec3, vec1);
2880   ptest(vec2, vec3);
2881   jcc(Assembler::carryClear, FOUND_CHAR);
2882   addptr(result, 16);
2883   subl(tmp, stride);
2884   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2885   bind(SCAN_TO_CHAR);
2886   testl(cnt1, cnt1);
2887   jcc(Assembler::zero, RET_NOT_FOUND);
2888   bind(SCAN_TO_CHAR_LOOP);
2889   load_unsigned_short(tmp, Address(result, 0));
2890   cmpl(ch, tmp);
2891   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2892   addptr(result, 2);
2893   subl(cnt1, 1);
2894   jccb(Assembler::zero, RET_NOT_FOUND);
2895   jmp(SCAN_TO_CHAR_LOOP);
2896 
2897   bind(RET_NOT_FOUND);
2898   movl(result, -1);
2899   jmpb(DONE_LABEL);
2900 
2901   bind(FOUND_CHAR);
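  // pmovmskb collects the sign bit of every byte of vec3 into tmp, so bsfl
  // yields the byte offset of the first matching char within the current
  // 16/32-byte chunk at result; both bytes of a matching char compare equal,
  // and the lower bit gives the char's start offset.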
2902   if (UseAVX >= 2) {
2903     vpmovmskb(tmp, vec3);
2904   } else {
2905     pmovmskb(tmp, vec3);
2906   }
2907   bsfl(ch, tmp);
2908   addptr(result, ch);
2909 
2910   bind(FOUND_SEQ_CHAR);
2911   subptr(result, str1);
2912   shrl(result, 1);
2913 
2914   bind(DONE_LABEL);
2915 } // string_indexof_char
2916 
2917 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2918                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2919   ShortBranchVerifier sbv(this);
2920   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2921 
2922   int stride = 16;
2923 
2924   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2925         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2926         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2927         FOUND_SEQ_CHAR, DONE_LABEL;
2928 
2929   movptr(result, str1);
2930   if (UseAVX >= 2) {
2931     cmpl(cnt1, stride);
2932     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2933     cmpl(cnt1, stride*2);
2934     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2935     movdl(vec1, ch);
2936     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2937     vpxor(vec2, vec2);
2938     movl(tmp, cnt1);
2939     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2940     andl(cnt1,0x0000001F);  //tail count (in chars)
2941 
2942     bind(SCAN_TO_32_CHAR_LOOP);
2943     vmovdqu(vec3, Address(result, 0));
2944     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2945     vptest(vec2, vec3);
2946     jcc(Assembler::carryClear, FOUND_CHAR);
2947     addptr(result, 32);
2948     subl(tmp, stride*2);
2949     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2950     jmp(SCAN_TO_16_CHAR);
2951 
2952     bind(SCAN_TO_16_CHAR_INIT);
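    // movdl places the byte value of ch in lane 0; pshufb with an all-zero
    // shuffle mask (vec2) then broadcasts that byte into all 16 lanes of vec1.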
2953     movdl(vec1, ch);
2954     pxor(vec2, vec2);
2955     pshufb(vec1, vec2);
2956   }
2957 
2958   bind(SCAN_TO_16_CHAR);
2959   cmpl(cnt1, stride);
2960   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2961   if (UseAVX < 2) {
2962     movdl(vec1, ch);
2963     pxor(vec2, vec2);
2964     pshufb(vec1, vec2);
2965   }
2966   movl(tmp, cnt1);
2967   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
2968   andl(cnt1,0x0000000F);  //tail count (in bytes)
2969 
2970   bind(SCAN_TO_16_CHAR_LOOP);
2971   movdqu(vec3, Address(result, 0));
2972   pcmpeqb(vec3, vec1);
2973   ptest(vec2, vec3);
2974   jcc(Assembler::carryClear, FOUND_CHAR);
2975   addptr(result, 16);
2976   subl(tmp, stride);
2977   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
2978 
2979   bind(SCAN_TO_CHAR_INIT);
2980   testl(cnt1, cnt1);
2981   jcc(Assembler::zero, RET_NOT_FOUND);
2982   bind(SCAN_TO_CHAR_LOOP);
2983   load_unsigned_byte(tmp, Address(result, 0));
2984   cmpl(ch, tmp);
2985   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2986   addptr(result, 1);
2987   subl(cnt1, 1);
2988   jccb(Assembler::zero, RET_NOT_FOUND);
2989   jmp(SCAN_TO_CHAR_LOOP);
2990 
2991   bind(RET_NOT_FOUND);
2992   movl(result, -1);
2993   jmpb(DONE_LABEL);
2994 
2995   bind(FOUND_CHAR);
2996   if (UseAVX >= 2) {
2997     vpmovmskb(tmp, vec3);
2998   } else {
2999     pmovmskb(tmp, vec3);
3000   }
3001   bsfl(ch, tmp);
3002   addptr(result, ch);
3003 
3004   bind(FOUND_SEQ_CHAR);
3005   subptr(result, str1);
3006 
3007   bind(DONE_LABEL);
3008 } // stringL_indexof_char
3009 
3010 // helper function for string_compare
3011 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3012                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3013                                            Address::ScaleFactor scale2, Register index, int ae) {
3014   if (ae == StrIntrinsicNode::LL) {
3015     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3016     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3017   } else if (ae == StrIntrinsicNode::UU) {
3018     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3019     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3020   } else {
3021     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3022     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3023   }
3024 }
3025 
3026 // Compare strings, used for char[] and byte[].
3027 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3028                                        Register cnt1, Register cnt2, Register result,
3029                                        XMMRegister vec1, int ae, KRegister mask) {
3030   ShortBranchVerifier sbv(this);
3031   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3032   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3033   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3034   int stride2x2 = 0x40;
3035   Address::ScaleFactor scale = Address::no_scale;
3036   Address::ScaleFactor scale1 = Address::no_scale;
3037   Address::ScaleFactor scale2 = Address::no_scale;
3038 
3039   if (ae != StrIntrinsicNode::LL) {
3040     stride2x2 = 0x20;
3041   }
3042 
3043   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3044     shrl(cnt2, 1);
3045   }
3046   // Compute the minimum of the string lengths and the
3047   // difference of the string lengths (stack).
3048   // Use a conditional move to select the minimum.
3049   movl(result, cnt1);
3050   subl(cnt1, cnt2);
3051   push(cnt1);
3052   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3053 
3054   // Is the minimum length zero?
3055   testl(cnt2, cnt2);
3056   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3057   if (ae == StrIntrinsicNode::LL) {
3058     // Load first bytes
3059     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3060     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3061   } else if (ae == StrIntrinsicNode::UU) {
3062     // Load first characters
3063     load_unsigned_short(result, Address(str1, 0));
3064     load_unsigned_short(cnt1, Address(str2, 0));
3065   } else {
3066     load_unsigned_byte(result, Address(str1, 0));
3067     load_unsigned_short(cnt1, Address(str2, 0));
3068   }
3069   subl(result, cnt1);
3070   jcc(Assembler::notZero,  POP_LABEL);
3071 
3072   if (ae == StrIntrinsicNode::UU) {
3073     // Divide length by 2 to get number of chars
3074     shrl(cnt2, 1);
3075   }
3076   cmpl(cnt2, 1);
3077   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3078 
3079   // Check if the strings start at the same location and setup scale and stride
3080   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3081     cmpptr(str1, str2);
3082     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3083     if (ae == StrIntrinsicNode::LL) {
3084       scale = Address::times_1;
3085       stride = 16;
3086     } else {
3087       scale = Address::times_2;
3088       stride = 8;
3089     }
3090   } else {
3091     scale1 = Address::times_1;
3092     scale2 = Address::times_2;
3093     // scale not used
3094     stride = 8;
3095   }
3096 
3097   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3098     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3099     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3100     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3101     Label COMPARE_TAIL_LONG;
3102     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3103 
3104     int pcmpmask = 0x19;
3105     if (ae == StrIntrinsicNode::LL) {
3106       pcmpmask &= ~0x01;
3107     }
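    // imm8 0x19 selects unsigned words (bits 1:0 = 01), the "equal each" string
    // compare (bits 3:2 = 10) and negative polarity (bits 5:4 = 01), so a set
    // result bit marks a mismatch and rcx receives the index of the first one.
    // Clearing bit 0 switches the element size to unsigned bytes for LL.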
3108 
3109     // Setup to compare 16-char (32-byte) vectors,
3110     // starting from the first character again because its address is aligned.
3111     if (ae == StrIntrinsicNode::LL) {
3112       stride2 = 32;
3113     } else {
3114       stride2 = 16;
3115     }
3116     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3117       adr_stride = stride << scale;
3118     } else {
3119       adr_stride1 = 8;  //stride << scale1;
3120       adr_stride2 = 16; //stride << scale2;
3121     }
3122 
3123     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3124     // rax and rdx are used by pcmpestri as element counters
3125     movl(result, cnt2);
3126     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3127     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3128 
3129     // fast path : compare first 2 8-char vectors.
3130     bind(COMPARE_16_CHARS);
3131     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3132       movdqu(vec1, Address(str1, 0));
3133     } else {
3134       pmovzxbw(vec1, Address(str1, 0));
3135     }
3136     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3137     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3138 
3139     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3140       movdqu(vec1, Address(str1, adr_stride));
3141       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3142     } else {
3143       pmovzxbw(vec1, Address(str1, adr_stride1));
3144       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3145     }
3146     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3147     addl(cnt1, stride);
3148 
3149     // Compare the characters at index in cnt1
3150     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3151     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3152     subl(result, cnt2);
3153     jmp(POP_LABEL);
3154 
3155     // Setup the registers to start vector comparison loop
3156     bind(COMPARE_WIDE_VECTORS);
3157     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3158       lea(str1, Address(str1, result, scale));
3159       lea(str2, Address(str2, result, scale));
3160     } else {
3161       lea(str1, Address(str1, result, scale1));
3162       lea(str2, Address(str2, result, scale2));
3163     }
3164     subl(result, stride2);
3165     subl(cnt2, stride2);
3166     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3167     negptr(result);
3168 
3169     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3170     bind(COMPARE_WIDE_VECTORS_LOOP);
3171 
3172 #ifdef _LP64
3173     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3174       cmpl(cnt2, stride2x2);
3175       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3176       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3177       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3178 
3179       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3180       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3181         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3182         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3183       } else {
3184         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3185         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3186       }
3187       kortestql(mask, mask);
3188       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3189       addptr(result, stride2x2);  // update since we already compared at this addr
3190       subl(cnt2, stride2x2);      // and sub the size too
3191       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3192 
3193       vpxor(vec1, vec1);
3194       jmpb(COMPARE_WIDE_TAIL);
3195     }//if (VM_Version::supports_avx512vlbw())
3196 #endif // _LP64
3197 
3198 
3199     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3200     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3201       vmovdqu(vec1, Address(str1, result, scale));
3202       vpxor(vec1, Address(str2, result, scale));
3203     } else {
3204       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3205       vpxor(vec1, Address(str2, result, scale2));
3206     }
3207     vptest(vec1, vec1);
3208     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3209     addptr(result, stride2);
3210     subl(cnt2, stride2);
3211     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3212     // clean upper bits of YMM registers
3213     vpxor(vec1, vec1);
3214 
3215     // compare wide vectors tail
3216     bind(COMPARE_WIDE_TAIL);
3217     testptr(result, result);
3218     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3219 
3220     movl(result, stride2);
3221     movl(cnt2, result);
3222     negptr(result);
3223     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3224 
3225     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3226     bind(VECTOR_NOT_EQUAL);
3227     // clean upper bits of YMM registers
3228     vpxor(vec1, vec1);
3229     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3230       lea(str1, Address(str1, result, scale));
3231       lea(str2, Address(str2, result, scale));
3232     } else {
3233       lea(str1, Address(str1, result, scale1));
3234       lea(str2, Address(str2, result, scale2));
3235     }
3236     jmp(COMPARE_16_CHARS);
3237 
3238     // Compare tail chars, length between 1 and 15 chars
3239     bind(COMPARE_TAIL_LONG);
3240     movl(cnt2, result);
3241     cmpl(cnt2, stride);
3242     jcc(Assembler::less, COMPARE_SMALL_STR);
3243 
3244     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3245       movdqu(vec1, Address(str1, 0));
3246     } else {
3247       pmovzxbw(vec1, Address(str1, 0));
3248     }
3249     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3250     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3251     subptr(cnt2, stride);
3252     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3253     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3254       lea(str1, Address(str1, result, scale));
3255       lea(str2, Address(str2, result, scale));
3256     } else {
3257       lea(str1, Address(str1, result, scale1));
3258       lea(str2, Address(str2, result, scale2));
3259     }
3260     negptr(cnt2);
3261     jmpb(WHILE_HEAD_LABEL);
3262 
3263     bind(COMPARE_SMALL_STR);
3264   } else if (UseSSE42Intrinsics) {
3265     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3266     int pcmpmask = 0x19;
3267     // Setup to compare 8-char (16-byte) vectors,
3268     // starting from the first character again because its address is aligned.
3269     movl(result, cnt2);
3270     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3271     if (ae == StrIntrinsicNode::LL) {
3272       pcmpmask &= ~0x01;
3273     }
3274     jcc(Assembler::zero, COMPARE_TAIL);
3275     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3276       lea(str1, Address(str1, result, scale));
3277       lea(str2, Address(str2, result, scale));
3278     } else {
3279       lea(str1, Address(str1, result, scale1));
3280       lea(str2, Address(str2, result, scale2));
3281     }
3282     negptr(result);
3283 
3284     // pcmpestri
3285     //   inputs:
3286     //     vec1 - substring
3287     //     rax - negative string length (elements count)
3288     //     mem - scanned string
3289     //     rdx - string length (elements count)
3290     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3291     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3292     //   outputs:
3293     //     rcx - first mismatched element index
3294     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3295 
3296     bind(COMPARE_WIDE_VECTORS);
3297     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3298       movdqu(vec1, Address(str1, result, scale));
3299       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3300     } else {
3301       pmovzxbw(vec1, Address(str1, result, scale1));
3302       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3303     }
3304     // After pcmpestri cnt1(rcx) contains mismatched element index
3305 
3306     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3307     addptr(result, stride);
3308     subptr(cnt2, stride);
3309     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3310 
3311     // compare wide vectors tail
3312     testptr(result, result);
3313     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3314 
3315     movl(cnt2, stride);
3316     movl(result, stride);
3317     negptr(result);
3318     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3319       movdqu(vec1, Address(str1, result, scale));
3320       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3321     } else {
3322       pmovzxbw(vec1, Address(str1, result, scale1));
3323       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3324     }
3325     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3326 
3327     // Mismatched characters in the vectors
3328     bind(VECTOR_NOT_EQUAL);
3329     addptr(cnt1, result);
3330     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3331     subl(result, cnt2);
3332     jmpb(POP_LABEL);
3333 
3334     bind(COMPARE_TAIL); // limit is zero
3335     movl(cnt2, result);
3336     // Fallthru to tail compare
3337   }
3338   // Shift str2 and str1 to the end of the arrays, negate min
3339   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3340     lea(str1, Address(str1, cnt2, scale));
3341     lea(str2, Address(str2, cnt2, scale));
3342   } else {
3343     lea(str1, Address(str1, cnt2, scale1));
3344     lea(str2, Address(str2, cnt2, scale2));
3345   }
3346   decrementl(cnt2);  // first character was compared already
3347   negptr(cnt2);
3348 
3349   // Compare the rest of the elements
3350   bind(WHILE_HEAD_LABEL);
3351   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3352   subl(result, cnt1);
3353   jccb(Assembler::notZero, POP_LABEL);
3354   increment(cnt2);
3355   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3356 
3357   // Strings are equal up to min length.  Return the length difference.
3358   bind(LENGTH_DIFF_LABEL);
3359   pop(result);
3360   if (ae == StrIntrinsicNode::UU) {
3361     // Divide diff by 2 to get number of chars
3362     sarl(result, 1);
3363   }
3364   jmpb(DONE_LABEL);
3365 
3366 #ifdef _LP64
3367   if (VM_Version::supports_avx512vlbw()) {
3368 
3369     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3370 
3371     kmovql(cnt1, mask);
3372     notq(cnt1);
3373     bsfq(cnt2, cnt1);
3374     if (ae != StrIntrinsicNode::LL) {
3375       // Divide diff by 2 to get number of chars
3376       sarl(cnt2, 1);
3377     }
3378     addq(result, cnt2);
3379     if (ae == StrIntrinsicNode::LL) {
3380       load_unsigned_byte(cnt1, Address(str2, result));
3381       load_unsigned_byte(result, Address(str1, result));
3382     } else if (ae == StrIntrinsicNode::UU) {
3383       load_unsigned_short(cnt1, Address(str2, result, scale));
3384       load_unsigned_short(result, Address(str1, result, scale));
3385     } else {
3386       load_unsigned_short(cnt1, Address(str2, result, scale2));
3387       load_unsigned_byte(result, Address(str1, result, scale1));
3388     }
3389     subl(result, cnt1);
3390     jmpb(POP_LABEL);
3391   }//if (VM_Version::supports_avx512vlbw())
3392 #endif // _LP64
3393 
3394   // Discard the stored length difference
3395   bind(POP_LABEL);
3396   pop(cnt1);
3397 
3398   // That's it
3399   bind(DONE_LABEL);
3400   if (ae == StrIntrinsicNode::UL) {
3401     negl(result);
3402   }
3403 
3404 }
3405 
3406 // Search for Non-ASCII character (Negative byte value) in a byte array,
3407 // return true if it has any and false otherwise.
3408 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3409 //   @IntrinsicCandidate
3410 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3411 //     for (int i = off; i < off + len; i++) {
3412 //       if (ba[i] < 0) {
3413 //         return true;
3414 //       }
3415 //     }
3416 //     return false;
3417 //   }
3418 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3419   Register result, Register tmp1,
3420   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3421   // rsi: byte array
3422   // rcx: len
3423   // rax: result
3424   ShortBranchVerifier sbv(this);
3425   assert_different_registers(ary1, len, result, tmp1);
3426   assert_different_registers(vec1, vec2);
3427   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3428 
3429   // len == 0
3430   testl(len, len);
3431   jcc(Assembler::zero, FALSE_LABEL);
3432 
3433   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3434     VM_Version::supports_avx512vlbw() &&
3435     VM_Version::supports_bmi2()) {
3436 
3437     Label test_64_loop, test_tail;
3438     Register tmp3_aliased = len;
3439 
3440     movl(tmp1, len);
3441     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3442 
3443     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3444     andl(len, ~(64 - 1));    // vector count (in chars)
3445     jccb(Assembler::zero, test_tail);
3446 
3447     lea(ary1, Address(ary1, len, Address::times_1));
3448     negptr(len);
3449 
3450     bind(test_64_loop);
3451     // Check whether any of the 64 byte elements is negative
3452     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3453     kortestql(mask1, mask1);
3454     jcc(Assembler::notZero, TRUE_LABEL);
3455 
3456     addptr(len, 64);
3457     jccb(Assembler::notZero, test_64_loop);
3458 
3459 
3460     bind(test_tail);
3461     // bail out when there is nothing to be done
3462     testl(tmp1, -1);
3463     jcc(Assembler::zero, FALSE_LABEL);
3464 
3465     // ~(~0 << len) applied up to two times (for 32-bit scenario)
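    // For example, tmp1 == 5 gives ~(~0 << 5) == 0b11111: a mask selecting the
    // 5 tail bytes for the final masked compare.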
3466 #ifdef _LP64
3467     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3468     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3469     notq(tmp3_aliased);
3470     kmovql(mask2, tmp3_aliased);
3471 #else
3472     Label k_init;
3473     jmp(k_init);
3474 
3475     // We cannot read 64 bits from a general purpose register, so we move the
3476     // data required to compose 64 ones into the instruction stream.
3477     // We emit a 64-byte-wide series of elements 0..63 which is later used
3478     // as a compare target against the tail count contained in tmp1.
3479     // The result is a k register with tmp1 consecutive 1 bits, counting
3480     // from the least significant bit.
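    // For example, tmp1 == 5: broadcasting 5 and comparing it greater-than
    // against the byte sequence 0,1,...,63 sets exactly the low 5 bits of
    // mask2, matching the ~(~0 << 5) mask produced on 64-bit.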
3481     address tmp = pc();
3482     emit_int64(0x0706050403020100);
3483     emit_int64(0x0F0E0D0C0B0A0908);
3484     emit_int64(0x1716151413121110);
3485     emit_int64(0x1F1E1D1C1B1A1918);
3486     emit_int64(0x2726252423222120);
3487     emit_int64(0x2F2E2D2C2B2A2928);
3488     emit_int64(0x3736353433323130);
3489     emit_int64(0x3F3E3D3C3B3A3938);
3490 
3491     bind(k_init);
3492     lea(len, InternalAddress(tmp));
3493     // create mask to test for negative byte inside a vector
3494     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3495     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3496 
3497 #endif
3498     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3499     ktestq(mask1, mask2);
3500     jcc(Assembler::notZero, TRUE_LABEL);
3501 
3502     jmp(FALSE_LABEL);
3503   } else {
3504     movl(result, len); // copy
3505 
3506     if (UseAVX >= 2 && UseSSE >= 2) {
3507       // With AVX2, use 32-byte vector compare
3508       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3509 
3510       // Compare 32-byte vectors
3511       andl(result, 0x0000001f);  //   tail count (in bytes)
3512       andl(len, 0xffffffe0);   // vector count (in bytes)
3513       jccb(Assembler::zero, COMPARE_TAIL);
3514 
3515       lea(ary1, Address(ary1, len, Address::times_1));
3516       negptr(len);
3517 
3518       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3519       movdl(vec2, tmp1);
3520       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
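      // vptest sets ZF only if (vec1 & vec2) is all zero, so the notZero
      // branch below is taken as soon as any loaded byte has its sign bit set.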
3521 
3522       bind(COMPARE_WIDE_VECTORS);
3523       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3524       vptest(vec1, vec2);
3525       jccb(Assembler::notZero, TRUE_LABEL);
3526       addptr(len, 32);
3527       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3528 
3529       testl(result, result);
3530       jccb(Assembler::zero, FALSE_LABEL);
3531 
3532       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3533       vptest(vec1, vec2);
3534       jccb(Assembler::notZero, TRUE_LABEL);
3535       jmpb(FALSE_LABEL);
3536 
3537       bind(COMPARE_TAIL); // len is zero
3538       movl(len, result);
3539       // Fallthru to tail compare
3540     } else if (UseSSE42Intrinsics) {
3541       // With SSE4.2, use double quad vector compare
3542       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3543 
3544       // Compare 16-byte vectors
3545       andl(result, 0x0000000f);  //   tail count (in bytes)
3546       andl(len, 0xfffffff0);   // vector count (in bytes)
3547       jcc(Assembler::zero, COMPARE_TAIL);
3548 
3549       lea(ary1, Address(ary1, len, Address::times_1));
3550       negptr(len);
3551 
3552       movl(tmp1, 0x80808080);
3553       movdl(vec2, tmp1);
3554       pshufd(vec2, vec2, 0);
3555 
3556       bind(COMPARE_WIDE_VECTORS);
3557       movdqu(vec1, Address(ary1, len, Address::times_1));
3558       ptest(vec1, vec2);
3559       jcc(Assembler::notZero, TRUE_LABEL);
3560       addptr(len, 16);
3561       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3562 
3563       testl(result, result);
3564       jcc(Assembler::zero, FALSE_LABEL);
3565 
3566       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3567       ptest(vec1, vec2);
3568       jccb(Assembler::notZero, TRUE_LABEL);
3569       jmpb(FALSE_LABEL);
3570 
3571       bind(COMPARE_TAIL); // len is zero
3572       movl(len, result);
3573       // Fallthru to tail compare
3574     }
3575   }
3576   // Compare 4-byte vectors
3577   andl(len, 0xfffffffc); // vector count (in bytes)
3578   jccb(Assembler::zero, COMPARE_CHAR);
3579 
3580   lea(ary1, Address(ary1, len, Address::times_1));
3581   negptr(len);
3582 
3583   bind(COMPARE_VECTORS);
3584   movl(tmp1, Address(ary1, len, Address::times_1));
3585   andl(tmp1, 0x80808080);
3586   jccb(Assembler::notZero, TRUE_LABEL);
3587   addptr(len, 4);
3588   jcc(Assembler::notZero, COMPARE_VECTORS);
3589 
3590   // Compare trailing char (final 2 bytes), if any
3591   bind(COMPARE_CHAR);
3592   testl(result, 0x2);   // tail  char
3593   jccb(Assembler::zero, COMPARE_BYTE);
3594   load_unsigned_short(tmp1, Address(ary1, 0));
3595   andl(tmp1, 0x00008080);
3596   jccb(Assembler::notZero, TRUE_LABEL);
3597   subptr(result, 2);
3598   lea(ary1, Address(ary1, 2));
3599 
3600   bind(COMPARE_BYTE);
3601   testl(result, 0x1);   // tail  byte
3602   jccb(Assembler::zero, FALSE_LABEL);
3603   load_unsigned_byte(tmp1, Address(ary1, 0));
3604   andl(tmp1, 0x00000080);
3605   jccb(Assembler::notEqual, TRUE_LABEL);
3606   jmpb(FALSE_LABEL);
3607 
3608   bind(TRUE_LABEL);
3609   movl(result, 1);   // return true
3610   jmpb(DONE);
3611 
3612   bind(FALSE_LABEL);
3613   xorl(result, result); // return false
3614 
3615   // That's it
3616   bind(DONE);
3617   if (UseAVX >= 2 && UseSSE >= 2) {
3618     // clean upper bits of YMM registers
3619     vpxor(vec1, vec1);
3620     vpxor(vec2, vec2);
3621   }
3622 }
3623 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
3624 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3625                                       Register limit, Register result, Register chr,
3626                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3627   ShortBranchVerifier sbv(this);
3628   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3629 
3630   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3631   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3632 
3633   if (is_array_equ) {
3634     // Check the input args
3635     cmpoop(ary1, ary2);
3636     jcc(Assembler::equal, TRUE_LABEL);
3637 
3638     // Need additional checks for arrays_equals.
3639     testptr(ary1, ary1);
3640     jcc(Assembler::zero, FALSE_LABEL);
3641     testptr(ary2, ary2);
3642     jcc(Assembler::zero, FALSE_LABEL);
3643 
3644     // Check the lengths
3645     movl(limit, Address(ary1, length_offset));
3646     cmpl(limit, Address(ary2, length_offset));
3647     jcc(Assembler::notEqual, FALSE_LABEL);
3648   }
3649 
3650   // count == 0
3651   testl(limit, limit);
3652   jcc(Assembler::zero, TRUE_LABEL);
3653 
3654   if (is_array_equ) {
3655     // Load array address
3656     lea(ary1, Address(ary1, base_offset));
3657     lea(ary2, Address(ary2, base_offset));
3658   }
3659 
3660   if (is_array_equ && is_char) {
3661     // arrays_equals when used for char[].
3662     shll(limit, 1);      // convert char count to byte count (still != 0)
3663   }
3664   movl(result, limit); // copy
3665 
3666   if (UseAVX >= 2) {
3667     // With AVX2, use 32-byte vector compare
3668     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3669 
3670     // Compare 32-byte vectors
3671     andl(result, 0x0000001f);  //   tail count (in bytes)
3672     andl(limit, 0xffffffe0);   // vector count (in bytes)
3673     jcc(Assembler::zero, COMPARE_TAIL);
3674 
3675     lea(ary1, Address(ary1, limit, Address::times_1));
3676     lea(ary2, Address(ary2, limit, Address::times_1));
3677     negptr(limit);
3678 
3679 #ifdef _LP64
3680     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3681       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3682 
3683       cmpl(limit, -64);
3684       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3685 
3686       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3687 
3688       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3689       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3690       kortestql(mask, mask);
3691       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3692       addptr(limit, 64);  // update since we already compared at this addr
3693       cmpl(limit, -64);
3694       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3695 
3696       // At this point we may still need to compare -limit+result bytes.
3697       // We could execute the next two instructions and just continue via the non-wide path:
3698       //  cmpl(limit, 0);
3699       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3700       // But since we stopped at the points ary{1,2}+limit which are
3701       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3702       // (|limit| <= 32 and result < 32),
3703       // we may just compare the last 64 bytes.
3704       //
3705       addptr(result, -64);   // it is safe, because we just came from this area
3706       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3707       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3708       kortestql(mask, mask);
3709       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3710 
3711       jmp(TRUE_LABEL);
3712 
3713       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3714 
3715     }//if (VM_Version::supports_avx512vlbw())
3716 #endif //_LP64
3717     bind(COMPARE_WIDE_VECTORS);
3718     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3719     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3720     vpxor(vec1, vec2);
3721 
3722     vptest(vec1, vec1);
3723     jcc(Assembler::notZero, FALSE_LABEL);
3724     addptr(limit, 32);
3725     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3726 
3727     testl(result, result);
3728     jcc(Assembler::zero, TRUE_LABEL);
3729 
3730     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3731     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3732     vpxor(vec1, vec2);
3733 
3734     vptest(vec1, vec1);
3735     jccb(Assembler::notZero, FALSE_LABEL);
3736     jmpb(TRUE_LABEL);
3737 
3738     bind(COMPARE_TAIL); // limit is zero
3739     movl(limit, result);
3740     // Fallthru to tail compare
3741   } else if (UseSSE42Intrinsics) {
3742     // With SSE4.2, use double quad vector compare
3743     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3744 
3745     // Compare 16-byte vectors
3746     andl(result, 0x0000000f);  //   tail count (in bytes)
3747     andl(limit, 0xfffffff0);   // vector count (in bytes)
3748     jcc(Assembler::zero, COMPARE_TAIL);
3749 
3750     lea(ary1, Address(ary1, limit, Address::times_1));
3751     lea(ary2, Address(ary2, limit, Address::times_1));
3752     negptr(limit);
3753 
3754     bind(COMPARE_WIDE_VECTORS);
3755     movdqu(vec1, Address(ary1, limit, Address::times_1));
3756     movdqu(vec2, Address(ary2, limit, Address::times_1));
3757     pxor(vec1, vec2);
3758 
3759     ptest(vec1, vec1);
3760     jcc(Assembler::notZero, FALSE_LABEL);
3761     addptr(limit, 16);
3762     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3763 
3764     testl(result, result);
3765     jcc(Assembler::zero, TRUE_LABEL);
3766 
3767     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3768     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3769     pxor(vec1, vec2);
3770 
3771     ptest(vec1, vec1);
3772     jccb(Assembler::notZero, FALSE_LABEL);
3773     jmpb(TRUE_LABEL);
3774 
3775     bind(COMPARE_TAIL); // limit is zero
3776     movl(limit, result);
3777     // Fallthru to tail compare
3778   }
3779 
3780   // Compare 4-byte vectors
3781   andl(limit, 0xfffffffc); // vector count (in bytes)
3782   jccb(Assembler::zero, COMPARE_CHAR);
3783 
3784   lea(ary1, Address(ary1, limit, Address::times_1));
3785   lea(ary2, Address(ary2, limit, Address::times_1));
3786   negptr(limit);
3787 
3788   bind(COMPARE_VECTORS);
3789   movl(chr, Address(ary1, limit, Address::times_1));
3790   cmpl(chr, Address(ary2, limit, Address::times_1));
3791   jccb(Assembler::notEqual, FALSE_LABEL);
3792   addptr(limit, 4);
3793   jcc(Assembler::notZero, COMPARE_VECTORS);
3794 
3795   // Compare trailing char (final 2 bytes), if any
3796   bind(COMPARE_CHAR);
3797   testl(result, 0x2);   // tail  char
3798   jccb(Assembler::zero, COMPARE_BYTE);
3799   load_unsigned_short(chr, Address(ary1, 0));
3800   load_unsigned_short(limit, Address(ary2, 0));
3801   cmpl(chr, limit);
3802   jccb(Assembler::notEqual, FALSE_LABEL);
3803 
3804   if (is_array_equ && is_char) {
3805     bind(COMPARE_BYTE);
3806   } else {
3807     lea(ary1, Address(ary1, 2));
3808     lea(ary2, Address(ary2, 2));
3809 
3810     bind(COMPARE_BYTE);
3811     testl(result, 0x1);   // tail  byte
3812     jccb(Assembler::zero, TRUE_LABEL);
3813     load_unsigned_byte(chr, Address(ary1, 0));
3814     load_unsigned_byte(limit, Address(ary2, 0));
3815     cmpl(chr, limit);
3816     jccb(Assembler::notEqual, FALSE_LABEL);
3817   }
3818   bind(TRUE_LABEL);
3819   movl(result, 1);   // return true
3820   jmpb(DONE);
3821 
3822   bind(FALSE_LABEL);
3823   xorl(result, result); // return false
3824 
3825   // That's it
3826   bind(DONE);
3827   if (UseAVX >= 2) {
3828     // clean upper bits of YMM registers
3829     vpxor(vec1, vec1);
3830     vpxor(vec2, vec2);
3831   }
3832 }
3833 
3834 #ifdef _LP64
3835 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3836                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3837   assert(VM_Version::supports_avx512vlbw(), "");
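  // The incoming mask is a byte vector of 0/1 booleans; subtracting it from
  // zero turns each 1 into 0xFF so evpmovb2m can gather the lane sign bits
  // into a k register for the scalar popcnt/bsf/bsr below.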
3838   vpxor(xtmp, xtmp, xtmp, vec_enc);
3839   vpsubb(xtmp, xtmp, mask, vec_enc);
3840   evpmovb2m(ktmp, xtmp, vec_enc);
3841   kmovql(tmp, ktmp);
3842   switch(opc) {
3843     case Op_VectorMaskTrueCount:
3844       popcntq(dst, tmp);
3845       break;
3846     case Op_VectorMaskLastTrue:
3847       mov64(dst, -1);
3848       bsrq(tmp, tmp);
3849       cmov(Assembler::notZero, dst, tmp);
3850       break;
3851     case Op_VectorMaskFirstTrue:
3852       mov64(dst, masklen);
3853       bsfq(tmp, tmp);
3854       cmov(Assembler::notZero, dst, tmp);
3855       break;
3856     default: assert(false, "Unhandled mask operation");
3857   }
3858 }
3859 
3860 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3861                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3862   assert(VM_Version::supports_avx(), "");
3863   vpxor(xtmp, xtmp, xtmp, vec_enc);
3864   vpsubb(xtmp, xtmp, mask, vec_enc);
3865   vpmovmskb(tmp, xtmp, vec_enc);
3866   if (masklen < 64) {
3867     andq(tmp, (((jlong)1 << masklen) - 1));
3868   }
3869   switch(opc) {
3870     case Op_VectorMaskTrueCount:
3871       popcntq(dst, tmp);
3872       break;
3873     case Op_VectorMaskLastTrue:
3874       mov64(dst, -1);
3875       bsrq(tmp, tmp);
3876       cmov(Assembler::notZero, dst, tmp);
3877       break;
3878     case Op_VectorMaskFirstTrue:
3879       mov64(dst, masklen);
3880       bsfq(tmp, tmp);
3881       cmov(Assembler::notZero, dst, tmp);
3882       break;
3883     default: assert(false, "Unhandled mask operation");
3884   }
3885 }
3886 #endif