1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_CodeStubs.hpp"
  30 #include "opto/c2_MacroAssembler.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/opcodes.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/subnode.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  44   switch (vlen_in_bytes) {
  45     case  4: // fall-through
  46     case  8: // fall-through
  47     case 16: return Assembler::AVX_128bit;
  48     case 32: return Assembler::AVX_256bit;
  49     case 64: return Assembler::AVX_512bit;
  50 
  51     default: {
  52       ShouldNotReachHere();
  53       return Assembler::AVX_NoVec;
  54     }
  55   }
  56 }
  57 
  58 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  59   guarantee(PostLoopMultiversioning, "must be");
  60   Assembler::movl(dst, 1);
  61   Assembler::shlxl(dst, dst, src);
  62   Assembler::decl(dst);
  63   Assembler::kmovdl(mask, dst);
  64   Assembler::movl(dst, src);
  65 }
  66 
  67 void C2_MacroAssembler::restorevectmask(KRegister mask) {
  68   guarantee(PostLoopMultiversioning, "must be");
  69   Assembler::knotwl(mask, k0);
  70 }
  71 
  72 #if INCLUDE_RTM_OPT
  73 
  74 // Update rtm_counters based on abort status
  75 // input: abort_status
  76 //        rtm_counters (RTMLockingCounters*)
  77 // flags are killed
  78 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  79 
  80   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  81   if (PrintPreciseRTMLockingStatistics) {
  82     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  83       Label check_abort;
  84       testl(abort_status, (1<<i));
  85       jccb(Assembler::equal, check_abort);
  86       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  87       bind(check_abort);
  88     }
  89   }
  90 }
  91 
  92 // Branch if (random & (count-1) != 0), count is 2^n
  93 // tmp, scr and flags are killed
  94 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  95   assert(tmp == rax, "");
  96   assert(scr == rdx, "");
  97   rdtsc(); // modifies EDX:EAX
  98   andptr(tmp, count-1);
  99   jccb(Assembler::notZero, brLabel);
 100 }
 101 
 102 // Perform abort ratio calculation, set no_rtm bit if high ratio
 103 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 104 // tmpReg, rtm_counters_Reg and flags are killed
 105 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 106                                                     Register rtm_counters_Reg,
 107                                                     RTMLockingCounters* rtm_counters,
 108                                                     Metadata* method_data) {
 109   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 110 
 111   if (RTMLockingCalculationDelay > 0) {
 112     // Delay calculation
 113     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 114     testptr(tmpReg, tmpReg);
 115     jccb(Assembler::equal, L_done);
 116   }
 117   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 118   //   Aborted transactions = abort_count * 100
 119   //   All transactions = total_count *  RTMTotalCountIncrRate
 120   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 121 
 122   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 123   cmpptr(tmpReg, RTMAbortThreshold);
 124   jccb(Assembler::below, L_check_always_rtm2);
 125   imulptr(tmpReg, tmpReg, 100);
 126 
 127   Register scrReg = rtm_counters_Reg;
 128   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 129   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 130   imulptr(scrReg, scrReg, RTMAbortRatio);
 131   cmpptr(tmpReg, scrReg);
 132   jccb(Assembler::below, L_check_always_rtm1);
 133   if (method_data != NULL) {
 134     // set rtm_state to "no rtm" in MDO
 135     mov_metadata(tmpReg, method_data);
 136     lock();
 137     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 138   }
 139   jmpb(L_done);
 140   bind(L_check_always_rtm1);
 141   // Reload RTMLockingCounters* address
 142   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 143   bind(L_check_always_rtm2);
 144   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 145   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 146   jccb(Assembler::below, L_done);
 147   if (method_data != NULL) {
 148     // set rtm_state to "always rtm" in MDO
 149     mov_metadata(tmpReg, method_data);
 150     lock();
 151     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 152   }
 153   bind(L_done);
 154 }
 155 
 156 // Update counters and perform abort ratio calculation
 157 // input:  abort_status_Reg
 158 // rtm_counters_Reg, flags are killed
 159 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 160                                       Register rtm_counters_Reg,
 161                                       RTMLockingCounters* rtm_counters,
 162                                       Metadata* method_data,
 163                                       bool profile_rtm) {
 164 
 165   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 166   // update rtm counters based on rax value at abort
 167   // reads abort_status_Reg, updates flags
 168   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 169   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 170   if (profile_rtm) {
 171     // Save abort status because abort_status_Reg is used by following code.
 172     if (RTMRetryCount > 0) {
 173       push(abort_status_Reg);
 174     }
 175     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 176     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 177     // restore abort status
 178     if (RTMRetryCount > 0) {
 179       pop(abort_status_Reg);
 180     }
 181   }
 182 }
 183 
 184 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 185 // inputs: retry_count_Reg
 186 //       : abort_status_Reg
 187 // output: retry_count_Reg decremented by 1
 188 // flags are killed
 189 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 190   Label doneRetry;
 191   assert(abort_status_Reg == rax, "");
 192   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 193   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 194   // if reason is in 0x6 and retry count != 0 then retry
 195   andptr(abort_status_Reg, 0x6);
 196   jccb(Assembler::zero, doneRetry);
 197   testl(retry_count_Reg, retry_count_Reg);
 198   jccb(Assembler::zero, doneRetry);
 199   pause();
 200   decrementl(retry_count_Reg);
 201   jmp(retryLabel);
 202   bind(doneRetry);
 203 }
 204 
 205 // Spin and retry if lock is busy,
 206 // inputs: box_Reg (monitor address)
 207 //       : retry_count_Reg
 208 // output: retry_count_Reg decremented by 1
 209 //       : clear z flag if retry count exceeded
 210 // tmp_Reg, scr_Reg, flags are killed
 211 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 212                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 213   Label SpinLoop, SpinExit, doneRetry;
 214   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 215 
 216   testl(retry_count_Reg, retry_count_Reg);
 217   jccb(Assembler::zero, doneRetry);
 218   decrementl(retry_count_Reg);
 219   movptr(scr_Reg, RTMSpinLoopCount);
 220 
 221   bind(SpinLoop);
 222   pause();
 223   decrementl(scr_Reg);
 224   jccb(Assembler::lessEqual, SpinExit);
 225   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 226   testptr(tmp_Reg, tmp_Reg);
 227   jccb(Assembler::notZero, SpinLoop);
 228 
 229   bind(SpinExit);
 230   jmp(retryLabel);
 231   bind(doneRetry);
 232   incrementl(retry_count_Reg); // clear z flag
 233 }
 234 
 235 // Use RTM for normal stack locks
 236 // Input: objReg (object to lock)
 237 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 238                                          Register retry_on_abort_count_Reg,
 239                                          RTMLockingCounters* stack_rtm_counters,
 240                                          Metadata* method_data, bool profile_rtm,
 241                                          Label& DONE_LABEL, Label& IsInflated) {
 242   assert(UseRTMForStackLocks, "why call this otherwise?");
 243   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 244   assert(tmpReg == rax, "");
 245   assert(scrReg == rdx, "");
 246   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 247 
 248   if (RTMRetryCount > 0) {
 249     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 250     bind(L_rtm_retry);
 251   }
 252   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 253   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 254   jcc(Assembler::notZero, IsInflated);
 255 
 256   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 257     Label L_noincrement;
 258     if (RTMTotalCountIncrRate > 1) {
 259       // tmpReg, scrReg and flags are killed
 260       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 261     }
 262     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 263     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 264     bind(L_noincrement);
 265   }
 266   xbegin(L_on_abort);
 267   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 268   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 269   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 270   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 271 
 272   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 273   if (UseRTMXendForLockBusy) {
 274     xend();
 275     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 276     jmp(L_decrement_retry);
 277   }
 278   else {
 279     xabort(0);
 280   }
 281   bind(L_on_abort);
 282   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 283     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 284   }
 285   bind(L_decrement_retry);
 286   if (RTMRetryCount > 0) {
 287     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 288     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 289   }
 290 }
 291 
 292 // Use RTM for inflating locks
 293 // inputs: objReg (object to lock)
 294 //         boxReg (on-stack box address (displaced header location) - KILLED)
 295 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 296 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 297                                             Register scrReg, Register retry_on_busy_count_Reg,
 298                                             Register retry_on_abort_count_Reg,
 299                                             RTMLockingCounters* rtm_counters,
 300                                             Metadata* method_data, bool profile_rtm,
 301                                             Label& DONE_LABEL) {
 302   assert(UseRTMLocking, "why call this otherwise?");
 303   assert(tmpReg == rax, "");
 304   assert(scrReg == rdx, "");
 305   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 306   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 307 
 308   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 309   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 310   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 311 
 312   if (RTMRetryCount > 0) {
 313     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 314     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 315     bind(L_rtm_retry);
 316   }
 317   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 318     Label L_noincrement;
 319     if (RTMTotalCountIncrRate > 1) {
 320       // tmpReg, scrReg and flags are killed
 321       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 322     }
 323     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 324     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 325     bind(L_noincrement);
 326   }
 327   xbegin(L_on_abort);
 328   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 329   movptr(tmpReg, Address(tmpReg, owner_offset));
 330   testptr(tmpReg, tmpReg);
 331   jcc(Assembler::zero, DONE_LABEL);
 332   if (UseRTMXendForLockBusy) {
 333     xend();
 334     jmp(L_decrement_retry);
 335   }
 336   else {
 337     xabort(0);
 338   }
 339   bind(L_on_abort);
 340   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 341   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 342     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 343   }
 344   if (RTMRetryCount > 0) {
 345     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 346     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 347   }
 348 
 349   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 350   testptr(tmpReg, tmpReg) ;
 351   jccb(Assembler::notZero, L_decrement_retry) ;
 352 
 353   // Appears unlocked - try to swing _owner from null to non-null.
 354   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 355 #ifdef _LP64
 356   Register threadReg = r15_thread;
 357 #else
 358   get_thread(scrReg);
 359   Register threadReg = scrReg;
 360 #endif
 361   lock();
 362   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 363 
 364   if (RTMRetryCount > 0) {
 365     // success done else retry
 366     jccb(Assembler::equal, DONE_LABEL) ;
 367     bind(L_decrement_retry);
 368     // Spin and retry if lock is busy.
 369     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 370   }
 371   else {
 372     bind(L_decrement_retry);
 373   }
 374 }
 375 
 376 #endif //  INCLUDE_RTM_OPT
 377 
 378 // fast_lock and fast_unlock used by C2
 379 
 380 // Because the transitions from emitted code to the runtime
 381 // monitorenter/exit helper stubs are so slow it's critical that
 382 // we inline both the stack-locking fast path and the inflated fast path.
 383 //
 384 // See also: cmpFastLock and cmpFastUnlock.
 385 //
 386 // What follows is a specialized inline transliteration of the code
 387 // in enter() and exit(). If we're concerned about I$ bloat another
 388 // option would be to emit TrySlowEnter and TrySlowExit methods
 389 // at startup-time.  These methods would accept arguments as
 390 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 391 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 392 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 393 // In practice, however, the # of lock sites is bounded and is usually small.
 394 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 395 // if the processor uses simple bimodal branch predictors keyed by EIP
 396 // Since the helper routines would be called from multiple synchronization
 397 // sites.
 398 //
 399 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 400 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 401 // to those specialized methods.  That'd give us a mostly platform-independent
 402 // implementation that the JITs could optimize and inline at their pleasure.
 403 // Done correctly, the only time we'd need to cross to native could would be
 404 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 405 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 406 // (b) explicit barriers or fence operations.
 407 //
 408 // TODO:
 409 //
 410 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 411 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 412 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 413 //    the lock operators would typically be faster than reifying Self.
 414 //
 415 // *  Ideally I'd define the primitives as:
 416 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 417 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 418 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 419 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 420 //    Furthermore the register assignments are overconstrained, possibly resulting in
 421 //    sub-optimal code near the synchronization site.
 422 //
 423 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 424 //    Alternately, use a better sp-proximity test.
 425 //
 426 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 427 //    Either one is sufficient to uniquely identify a thread.
 428 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 429 //
 430 // *  Intrinsify notify() and notifyAll() for the common cases where the
 431 //    object is locked by the calling thread but the waitlist is empty.
 432 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 433 //
 434 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 435 //    But beware of excessive branch density on AMD Opterons.
 436 //
 437 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 438 //    or failure of the fast path.  If the fast path fails then we pass
 439 //    control to the slow path, typically in C.  In fast_lock and
 440 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 441 //    will emit a conditional branch immediately after the node.
 442 //    So we have branches to branches and lots of ICC.ZF games.
 443 //    Instead, it might be better to have C2 pass a "FailureLabel"
 444 //    into fast_lock and fast_unlock.  In the case of success, control
 445 //    will drop through the node.  ICC.ZF is undefined at exit.
 446 //    In the case of failure, the node will branch directly to the
 447 //    FailureLabel
 448 
 449 
 450 // obj: object to lock
 451 // box: on-stack box address (displaced header location) - KILLED
 452 // rax,: tmp -- KILLED
 453 // scr: tmp -- KILLED
 454 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 455                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 456                                  BiasedLockingCounters* counters,
 457                                  RTMLockingCounters* rtm_counters,
 458                                  RTMLockingCounters* stack_rtm_counters,
 459                                  Metadata* method_data,
 460                                  bool use_rtm, bool profile_rtm) {
 461   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 462   // Ensure the register assignments are disjoint
 463   assert(tmpReg == rax, "");
 464 
 465   if (use_rtm) {
 466     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 467   } else {
 468     assert(cx2Reg == noreg, "");
 469     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 470   }
 471 
 472   if (counters != NULL) {
 473     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 474   }
 475 
 476   // Possible cases that we'll encounter in fast_lock
 477   // ------------------------------------------------
 478   // * Inflated
 479   //    -- unlocked
 480   //    -- Locked
 481   //       = by self
 482   //       = by other
 483   // * biased
 484   //    -- by Self
 485   //    -- by other
 486   // * neutral
 487   // * stack-locked
 488   //    -- by self
 489   //       = sp-proximity test hits
 490   //       = sp-proximity test generates false-negative
 491   //    -- by other
 492   //
 493 
 494   Label IsInflated, DONE_LABEL;
 495 
 496   if (DiagnoseSyncOnValueBasedClasses != 0) {
 497     load_klass(tmpReg, objReg, cx1Reg);
 498     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 499     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 500     jcc(Assembler::notZero, DONE_LABEL);
 501   }
 502 
 503   // it's stack-locked, biased or neutral
 504   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 505   // order to reduce the number of conditional branches in the most common cases.
 506   // Beware -- there's a subtle invariant that fetch of the markword
 507   // at [FETCH], below, will never observe a biased encoding (*101b).
 508   // If this invariant is not held we risk exclusion (safety) failure.
 509   if (UseBiasedLocking && !UseOptoBiasInlining) {
 510     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 511   }
 512 
 513 #if INCLUDE_RTM_OPT
 514   if (UseRTMForStackLocks && use_rtm) {
 515     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 516                       stack_rtm_counters, method_data, profile_rtm,
 517                       DONE_LABEL, IsInflated);
 518   }
 519 #endif // INCLUDE_RTM_OPT
 520 
 521   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 522   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 523   jcc(Assembler::notZero, IsInflated);
 524 
 525   if (LockingMode == LM_MONITOR) {
 526     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 527     testptr(objReg, objReg);
 528   } else {
 529     assert(LockingMode == LM_LEGACY, "must be");
 530     // Attempt stack-locking ...
 531     orptr (tmpReg, markWord::unlocked_value);
 532     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 533     lock();
 534     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 535     if (counters != NULL) {
 536       cond_inc32(Assembler::equal,
 537                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 538     }
 539     jcc(Assembler::equal, DONE_LABEL);           // Success
 540 
 541     // Recursive locking.
 542     // The object is stack-locked: markword contains stack pointer to BasicLock.
 543     // Locked by current thread if difference with current SP is less than one page.
 544     subptr(tmpReg, rsp);
 545     // Next instruction set ZFlag == 1 (Success) if difference is less then one page.
 546     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 547     movptr(Address(boxReg, 0), tmpReg);
 548     if (counters != NULL) {
 549       cond_inc32(Assembler::equal,
 550                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 551     }
 552   }
 553   jmp(DONE_LABEL);
 554 
 555   bind(IsInflated);
 556   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 557 
 558 #if INCLUDE_RTM_OPT
 559   // Use the same RTM locking code in 32- and 64-bit VM.
 560   if (use_rtm) {
 561     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 562                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 563   } else {
 564 #endif // INCLUDE_RTM_OPT
 565 
 566 #ifndef _LP64
 567   // The object is inflated.
 568 
 569   // boxReg refers to the on-stack BasicLock in the current frame.
 570   // We'd like to write:
 571   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 572   // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
 573   // additional latency as we have another ST in the store buffer that must drain.
 574 
 575   // avoid ST-before-CAS
 576   // register juggle because we need tmpReg for cmpxchgptr below
 577   movptr(scrReg, boxReg);
 578   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 579 
 580   // Optimistic form: consider XORL tmpReg,tmpReg
 581   movptr(tmpReg, NULL_WORD);
 582 
 583   // Appears unlocked - try to swing _owner from null to non-null.
 584   // Ideally, I'd manifest "Self" with get_thread and then attempt
 585   // to CAS the register containing Self into m->Owner.
 586   // But we don't have enough registers, so instead we can either try to CAS
 587   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 588   // we later store "Self" into m->Owner.  Transiently storing a stack address
 589   // (rsp or the address of the box) into  m->owner is harmless.
 590   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 591   lock();
 592   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 593   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 594   // If we weren't able to swing _owner from NULL to the BasicLock
 595   // then take the slow path.
 596   jccb  (Assembler::notZero, DONE_LABEL);
 597   // update _owner from BasicLock to thread
 598   get_thread (scrReg);                    // beware: clobbers ICCs
 599   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 600   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 601 
 602   // If the CAS fails we can either retry or pass control to the slow path.
 603   // We use the latter tactic.
 604   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 605   // If the CAS was successful ...
 606   //   Self has acquired the lock
 607   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 608   // Intentional fall-through into DONE_LABEL ...
 609 #else // _LP64
 610   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 611   movq(scrReg, tmpReg);
 612   xorq(tmpReg, tmpReg);
 613   lock();
 614   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 615   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 616   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 617   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 618   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 619   jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)
 620 
 621   cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
 622   jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
 623   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 624   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 625 #endif // _LP64
 626 #if INCLUDE_RTM_OPT
 627   } // use_rtm()
 628 #endif
 629   // DONE_LABEL is a hot target - we'd really like to place it at the
 630   // start of cache line by padding with NOPs.
 631   // See the AMD and Intel software optimization manuals for the
 632   // most efficient "long" NOP encodings.
 633   // Unfortunately none of our alignment mechanisms suffice.
 634   bind(DONE_LABEL);
 635 
 636   // At DONE_LABEL the icc ZFlag is set as follows ...
 637   // fast_unlock uses the same protocol.
 638   // ZFlag == 1 -> Success
 639   // ZFlag == 0 -> Failure - force control through the slow path
 640 }
 641 
 642 // obj: object to unlock
 643 // box: box address (displaced header location), killed.  Must be EAX.
 644 // tmp: killed, cannot be obj nor box.
 645 //
 646 // Some commentary on balanced locking:
 647 //
 648 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 649 // Methods that don't have provably balanced locking are forced to run in the
 650 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 651 // The interpreter provides two properties:
 652 // I1:  At return-time the interpreter automatically and quietly unlocks any
 653 //      objects acquired the current activation (frame).  Recall that the
 654 //      interpreter maintains an on-stack list of locks currently held by
 655 //      a frame.
 656 // I2:  If a method attempts to unlock an object that is not held by the
 657 //      the frame the interpreter throws IMSX.
 658 //
 659 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 660 // B() doesn't have provably balanced locking so it runs in the interpreter.
 661 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 662 // is still locked by A().
 663 //
 664 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 665 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 666 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 667 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 668 // Arguably given that the spec legislates the JNI case as undefined our implementation
 669 // could reasonably *avoid* checking owner in fast_unlock().
 670 // In the interest of performance we elide m->Owner==Self check in unlock.
 671 // A perfectly viable alternative is to elide the owner check except when
 672 // Xcheck:jni is enabled.
 673 
 674 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 675   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 676   assert(boxReg == rax, "");
 677   assert_different_registers(objReg, boxReg, tmpReg);
 678 
 679   Label DONE_LABEL, Stacked, CheckSucc;
 680 
 681   // Critically, the biased locking test must have precedence over
 682   // and appear before the (box->dhw == 0) recursive stack-lock test.
 683   if (UseBiasedLocking && !UseOptoBiasInlining) {
 684     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 685   }
 686 
 687 #if INCLUDE_RTM_OPT
 688   if (UseRTMForStackLocks && use_rtm) {
 689     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 690     Label L_regular_unlock;
 691     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 692     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 693     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 694     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 695     xend();                                                           // otherwise end...
 696     jmp(DONE_LABEL);                                                  // ... and we're done
 697     bind(L_regular_unlock);
 698   }
 699 #endif
 700 
 701   if (LockingMode == LM_LEGACY) {
 702     cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 703     jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 704   }
 705   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 706   if (LockingMode != LM_MONITOR) {
 707     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 708     jcc(Assembler::zero, Stacked);
 709   }
 710 
 711   // It's inflated.
 712 
 713 #if INCLUDE_RTM_OPT
 714   if (use_rtm) {
 715     Label L_regular_inflated_unlock;
 716     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 717     movptr(boxReg, Address(tmpReg, owner_offset));
 718     testptr(boxReg, boxReg);
 719     jccb(Assembler::notZero, L_regular_inflated_unlock);
 720     xend();
 721     jmp(DONE_LABEL);
 722     bind(L_regular_inflated_unlock);
 723   }
 724 #endif
 725 
 726   // Despite our balanced locking property we still check that m->_owner == Self
 727   // as java routines or native JNI code called by this thread might
 728   // have released the lock.
 729   // Refer to the comments in synchronizer.cpp for how we might encode extra
 730   // state in _succ so we can avoid fetching EntryList|cxq.
 731   //
 732   // If there's no contention try a 1-0 exit.  That is, exit without
 733   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 734   // we detect and recover from the race that the 1-0 exit admits.
 735   //
 736   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 737   // before it STs null into _owner, releasing the lock.  Updates
 738   // to data protected by the critical section must be visible before
 739   // we drop the lock (and thus before any other thread could acquire
 740   // the lock and observe the fields protected by the lock).
 741   // IA32's memory-model is SPO, so STs are ordered with respect to
 742   // each other and there's no need for an explicit barrier (fence).
 743   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 744 #ifndef _LP64
 745   get_thread (boxReg);
 746 
 747   // Note that we could employ various encoding schemes to reduce
 748   // the number of loads below (currently 4) to just 2 or 3.
 749   // Refer to the comments in synchronizer.cpp.
 750   // In practice the chain of fetches doesn't seem to impact performance, however.
 751   xorptr(boxReg, boxReg);
 752   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 753   jccb  (Assembler::notZero, DONE_LABEL);
 754   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 755   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 756   jccb  (Assembler::notZero, DONE_LABEL);
 757   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 758   jmpb  (DONE_LABEL);
 759 
 760   // Intention fall-thru into DONE_LABEL
 761 
 762   // DONE_LABEL is a hot target - we'd really like to place it at the
 763   // start of cache line by padding with NOPs.
 764   // See the AMD and Intel software optimization manuals for the
 765   // most efficient "long" NOP encodings.
 766   // Unfortunately none of our alignment mechanisms suffice.
 767   bind (CheckSucc);
 768 #else // _LP64
 769   // It's inflated
 770   Label LNotRecursive, LSuccess, LGoSlowPath;
 771 
 772   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 773   jccb(Assembler::equal, LNotRecursive);
 774 
 775   // Recursive inflated unlock
 776   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 777   jmpb(LSuccess);
 778 
 779   bind(LNotRecursive);
 780   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 781   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 782   jccb  (Assembler::notZero, CheckSucc);
 783   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 784   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 785   jmpb  (DONE_LABEL);
 786 
 787   // Try to avoid passing control into the slow_path ...
 788   bind  (CheckSucc);
 789 
 790   // The following optional optimization can be elided if necessary
 791   // Effectively: if (succ == null) goto slow path
 792   // The code reduces the window for a race, however,
 793   // and thus benefits performance.
 794   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 795   jccb  (Assembler::zero, LGoSlowPath);
 796 
 797   xorptr(boxReg, boxReg);
 798   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 799   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 800 
 801   // Memory barrier/fence
 802   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 803   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 804   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 805   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 806   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 807   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 808   lock(); addl(Address(rsp, 0), 0);
 809 
 810   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 811   jccb  (Assembler::notZero, LSuccess);
 812 
 813   // Rare inopportune interleaving - race.
 814   // The successor vanished in the small window above.
 815   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 816   // We need to ensure progress and succession.
 817   // Try to reacquire the lock.
 818   // If that fails then the new owner is responsible for succession and this
 819   // thread needs to take no further action and can exit via the fast path (success).
 820   // If the re-acquire succeeds then pass control into the slow path.
 821   // As implemented, this latter mode is horrible because we generated more
 822   // coherence traffic on the lock *and* artifically extended the critical section
 823   // length while by virtue of passing control into the slow path.
 824 
 825   // box is really RAX -- the following CMPXCHG depends on that binding
 826   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 827   lock();
 828   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 829   // There's no successor so we tried to regrab the lock.
 830   // If that didn't work, then another thread grabbed the
 831   // lock so we're done (and exit was a success).
 832   jccb  (Assembler::notEqual, LSuccess);
 833   // Intentional fall-through into slow path
 834 
 835   bind  (LGoSlowPath);
 836   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 837   jmpb  (DONE_LABEL);
 838 
 839   bind  (LSuccess);
 840   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 841   jmpb  (DONE_LABEL);
 842 
 843 #endif
 844   if (LockingMode == LM_LEGACY) {
 845     bind  (Stacked);
 846     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 847     lock();
 848     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 849     // Intentional fall-thru into DONE_LABEL
 850   }
 851 
 852   bind(DONE_LABEL);
 853 }
 854 
 855 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 856                                               Register t, Register thread) {
 857   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 858   assert(rax_reg == rax, "Used for CAS");
 859   assert_different_registers(obj, box, rax_reg, t, thread);
 860 
 861   // Handle inflated monitor.
 862   Label inflated;
 863   // Finish fast lock successfully. ZF value is irrelevant.
 864   Label locked;
 865   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 866   Label slow_path;
 867 
 868   if (DiagnoseSyncOnValueBasedClasses != 0) {
 869     load_klass(rax_reg, obj, t);
 870     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 871     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 872     jcc(Assembler::notZero, slow_path);
 873   }
 874 
 875   const Register mark = t;
 876 
 877   { // Lightweight Lock
 878 
 879     Label push;
 880 
 881     const Register top = box;
 882 
 883     // Load the mark.
 884     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 885 
 886     // Prefetch top.
 887     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 888 
 889     // Check for monitor (0b10).
 890     testptr(mark, markWord::monitor_value);
 891     jcc(Assembler::notZero, inflated);
 892 
 893     // Check if lock-stack is full.
 894     cmpl(top, LockStack::end_offset() - 1);
 895     jcc(Assembler::greater, slow_path);
 896 
 897     // Check if recursive.
 898     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 899     jccb(Assembler::equal, push);
 900 
 901     // Try to lock. Transition lock bits 0b01 => 0b00
 902     movptr(rax_reg, mark);
 903     orptr(rax_reg, markWord::unlocked_value);
 904     andptr(mark, ~(int32_t)markWord::unlocked_value);
 905     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 906     jcc(Assembler::notEqual, slow_path);
 907 
 908     bind(push);
 909     // After successful lock, push object on lock-stack.
 910     movptr(Address(thread, top), obj);
 911     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 912     jmpb(locked);
 913   }
 914 
 915   { // Handle inflated monitor.
 916     bind(inflated);
 917 
 918     const Register tagged_monitor = mark;
 919 
 920     // CAS owner (null => current thread).
 921     xorptr(rax_reg, rax_reg);
 922     lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 923     jccb(Assembler::equal, locked);
 924 
 925     // Check if recursive.
 926     cmpptr(thread, rax_reg);
 927     jccb(Assembler::notEqual, slow_path);
 928 
 929     // Recursive.
 930     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 931   }
 932 
 933   bind(locked);
 934   // Set ZF = 1
 935   xorl(rax_reg, rax_reg);
 936 
 937 #ifdef ASSERT
 938   // Check that locked label is reached with ZF set.
 939   Label zf_correct;
 940   jccb(Assembler::zero, zf_correct);
 941   stop("Fast Lock ZF != 1");
 942 #endif
 943 
 944   bind(slow_path);
 945 #ifdef ASSERT
 946   // Check that slow_path label is reached with ZF not set.
 947   jccb(Assembler::notZero, zf_correct);
 948   stop("Fast Lock ZF != 0");
 949   bind(zf_correct);
 950 #endif
 951   // C2 uses the value of ZF to determine the continuation.
 952 }
 953 
 954 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
 955   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 956   assert(reg_rax == rax, "Used for CAS");
 957   assert_different_registers(obj, reg_rax, t);
 958 
 959   // Handle inflated monitor.
 960   Label inflated, inflated_check_lock_stack;
 961   // Finish fast unlock successfully.  MUST jump with ZF == 1
 962   Label unlocked;
 963 
 964   const Register mark = t;
 965   const Register top = reg_rax;
 966 
 967   Label dummy;
 968   C2FastUnlockLightweightStub* stub = nullptr;
 969 
 970   if (!Compile::current()->output()->in_scratch_emit_size()) {
 971     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
 972     Compile::current()->output()->add_stub(stub);
 973   }
 974 
 975   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
 976   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
 977 
 978   { // Lightweight Unlock
 979 
 980     // Load top.
 981     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 982 
 983     // Prefetch mark.
 984     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 985 
 986     // Check if obj is top of lock-stack.
 987     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 988     // Top of lock stack was not obj. Must be monitor.
 989     jcc(Assembler::notEqual, inflated_check_lock_stack);
 990 
 991     // Pop lock-stack.
 992     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
 993     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 994 
 995     // Check if recursive.
 996     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
 997     jcc(Assembler::equal, unlocked);
 998 
 999     // We elide the monitor check, let the CAS fail instead.
1000 
1001     // Try to unlock. Transition lock bits 0b00 => 0b01
1002     movptr(reg_rax, mark);
1003     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1004     orptr(mark, markWord::unlocked_value);
1005     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1006     jcc(Assembler::notEqual, push_and_slow_path);
1007     jmp(unlocked);
1008   }
1009 
1010 
1011   { // Handle inflated monitor.
1012     bind(inflated_check_lock_stack);
1013 #ifdef ASSERT
1014     Label check_done;
1015     subl(top, oopSize);
1016     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1017     jcc(Assembler::below, check_done);
1018     cmpptr(obj, Address(thread, top));
1019     jccb(Assembler::notEqual, inflated_check_lock_stack);
1020     stop("Fast Unlock lock on stack");
1021     bind(check_done);
1022     testptr(mark, markWord::monitor_value);
1023     jccb(Assembler::notZero, inflated);
1024     stop("Fast Unlock not monitor");
1025 #endif
1026 
1027     bind(inflated);
1028 
1029     // mark contains the tagged ObjectMonitor*.
1030     const Register monitor = mark;
1031 
1032 #ifndef _LP64
1033     // Check if recursive.
1034     xorptr(reg_rax, reg_rax);
1035     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1036     jcc(Assembler::notZero, check_successor);
1037 
1038     // Check if the entry lists are empty.
1039     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1040     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1041     jcc(Assembler::notZero, check_successor);
1042 
1043     // Release lock.
1044     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1045 #else // _LP64
1046     Label recursive;
1047 
1048     // Check if recursive.
1049     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1050     jccb(Assembler::notEqual, recursive);
1051 
1052     // Check if the entry lists are empty.
1053     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1054     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1055     jcc(Assembler::notZero, check_successor);
1056 
1057     // Release lock.
1058     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1059     jmpb(unlocked);
1060 
1061     // Recursive unlock.
1062     bind(recursive);
1063     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1064     xorl(t, t);
1065 #endif
1066   }
1067 
1068   bind(unlocked);
1069   if (stub != nullptr) {
1070     bind(stub->unlocked_continuation());
1071   }
1072 
1073 #ifdef ASSERT
1074   // Check that unlocked label is reached with ZF set.
1075   Label zf_correct;
1076   jccb(Assembler::zero, zf_correct);
1077   stop("Fast Unlock ZF != 1");
1078 #endif
1079 
1080   if (stub != nullptr) {
1081     bind(stub->slow_path_continuation());
1082   }
1083 #ifdef ASSERT
1084   // Check that stub->continuation() label is reached with ZF not set.
1085   jccb(Assembler::notZero, zf_correct);
1086   stop("Fast Unlock ZF != 0");
1087   bind(zf_correct);
1088 #endif
1089   // C2 uses the value of ZF to determine the continuation.
1090 }
1091 
1092 //-------------------------------------------------------------------------------------------
1093 // Generic instructions support for use in .ad files C2 code generation
1094 
1095 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
1096   if (dst != src) {
1097     movdqu(dst, src);
1098   }
1099   if (opcode == Op_AbsVD) {
1100     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
1101   } else {
1102     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
1103     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
1104   }
1105 }
1106 
1107 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
1108   if (opcode == Op_AbsVD) {
1109     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
1110   } else {
1111     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
1112     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
1113   }
1114 }
1115 
1116 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
1117   if (dst != src) {
1118     movdqu(dst, src);
1119   }
1120   if (opcode == Op_AbsVF) {
1121     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
1122   } else {
1123     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
1124     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
1125   }
1126 }
1127 
1128 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
1129   if (opcode == Op_AbsVF) {
1130     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
1131   } else {
1132     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
1133     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
1134   }
1135 }
1136 
1137 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1138   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1139   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1140 
1141   if (opcode == Op_MinV) {
1142     if (elem_bt == T_BYTE) {
1143       pminsb(dst, src);
1144     } else if (elem_bt == T_SHORT) {
1145       pminsw(dst, src);
1146     } else if (elem_bt == T_INT) {
1147       pminsd(dst, src);
1148     } else {
1149       assert(elem_bt == T_LONG, "required");
1150       assert(tmp == xmm0, "required");
1151       assert_different_registers(dst, src, tmp);
1152       movdqu(xmm0, dst);
1153       pcmpgtq(xmm0, src);
1154       blendvpd(dst, src);  // xmm0 as mask
1155     }
1156   } else { // opcode == Op_MaxV
1157     if (elem_bt == T_BYTE) {
1158       pmaxsb(dst, src);
1159     } else if (elem_bt == T_SHORT) {
1160       pmaxsw(dst, src);
1161     } else if (elem_bt == T_INT) {
1162       pmaxsd(dst, src);
1163     } else {
1164       assert(elem_bt == T_LONG, "required");
1165       assert(tmp == xmm0, "required");
1166       assert_different_registers(dst, src, tmp);
1167       movdqu(xmm0, src);
1168       pcmpgtq(xmm0, dst);
1169       blendvpd(dst, src);  // xmm0 as mask
1170     }
1171   }
1172 }
1173 
1174 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1175                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1176                                  int vlen_enc) {
1177   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1178 
1179   if (opcode == Op_MinV) {
1180     if (elem_bt == T_BYTE) {
1181       vpminsb(dst, src1, src2, vlen_enc);
1182     } else if (elem_bt == T_SHORT) {
1183       vpminsw(dst, src1, src2, vlen_enc);
1184     } else if (elem_bt == T_INT) {
1185       vpminsd(dst, src1, src2, vlen_enc);
1186     } else {
1187       assert(elem_bt == T_LONG, "required");
1188       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1189         vpminsq(dst, src1, src2, vlen_enc);
1190       } else {
1191         assert_different_registers(dst, src1, src2);
1192         vpcmpgtq(dst, src1, src2, vlen_enc);
1193         vblendvpd(dst, src1, src2, dst, vlen_enc);
1194       }
1195     }
1196   } else { // opcode == Op_MaxV
1197     if (elem_bt == T_BYTE) {
1198       vpmaxsb(dst, src1, src2, vlen_enc);
1199     } else if (elem_bt == T_SHORT) {
1200       vpmaxsw(dst, src1, src2, vlen_enc);
1201     } else if (elem_bt == T_INT) {
1202       vpmaxsd(dst, src1, src2, vlen_enc);
1203     } else {
1204       assert(elem_bt == T_LONG, "required");
1205       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1206         vpmaxsq(dst, src1, src2, vlen_enc);
1207       } else {
1208         assert_different_registers(dst, src1, src2);
1209         vpcmpgtq(dst, src1, src2, vlen_enc);
1210         vblendvpd(dst, src2, src1, dst, vlen_enc);
1211       }
1212     }
1213   }
1214 }
1215 
1216 // Float/Double min max
1217 
1218 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1219                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1220                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1221                                    int vlen_enc) {
1222   assert(UseAVX > 0, "required");
1223   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1224          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1225   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1226   assert_different_registers(a, b, tmp, atmp, btmp);
1227 
1228   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1229   bool is_double_word = is_double_word_type(elem_bt);
1230 
1231   if (!is_double_word && is_min) {
1232     vblendvps(atmp, a, b, a, vlen_enc);
1233     vblendvps(btmp, b, a, a, vlen_enc);
1234     vminps(tmp, atmp, btmp, vlen_enc);
1235     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1236     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1237   } else if (!is_double_word && !is_min) {
1238     vblendvps(btmp, b, a, b, vlen_enc);
1239     vblendvps(atmp, a, b, b, vlen_enc);
1240     vmaxps(tmp, atmp, btmp, vlen_enc);
1241     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1242     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1243   } else if (is_double_word && is_min) {
1244     vblendvpd(atmp, a, b, a, vlen_enc);
1245     vblendvpd(btmp, b, a, a, vlen_enc);
1246     vminpd(tmp, atmp, btmp, vlen_enc);
1247     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1248     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1249   } else {
1250     assert(is_double_word && !is_min, "sanity");
1251     vblendvpd(btmp, b, a, b, vlen_enc);
1252     vblendvpd(atmp, a, b, b, vlen_enc);
1253     vmaxpd(tmp, atmp, btmp, vlen_enc);
1254     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1255     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1256   }
1257 }
1258 
1259 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1260                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1261                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1262                                     int vlen_enc) {
1263   assert(UseAVX > 2, "required");
1264   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1265          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1266   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1267   assert_different_registers(dst, a, b, atmp, btmp);
1268 
1269   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1270   bool is_double_word = is_double_word_type(elem_bt);
1271   bool merge = true;
1272 
1273   if (!is_double_word && is_min) {
1274     evpmovd2m(ktmp, a, vlen_enc);
1275     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1276     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1277     vminps(dst, atmp, btmp, vlen_enc);
1278     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1279     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1280   } else if (!is_double_word && !is_min) {
1281     evpmovd2m(ktmp, b, vlen_enc);
1282     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1283     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1284     vmaxps(dst, atmp, btmp, vlen_enc);
1285     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1286     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1287   } else if (is_double_word && is_min) {
1288     evpmovq2m(ktmp, a, vlen_enc);
1289     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1290     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1291     vminpd(dst, atmp, btmp, vlen_enc);
1292     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1293     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1294   } else {
1295     assert(is_double_word && !is_min, "sanity");
1296     evpmovq2m(ktmp, b, vlen_enc);
1297     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1298     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1299     vmaxpd(dst, atmp, btmp, vlen_enc);
1300     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1301     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1302   }
1303 }
1304 
1305 // Float/Double signum
1306 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1307                                   XMMRegister zero, XMMRegister one,
1308                                   Register scratch) {
1309   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1310 
1311   Label DONE_LABEL;
1312 
1313   if (opcode == Op_SignumF) {
1314     assert(UseSSE > 0, "required");
1315     ucomiss(dst, zero);
1316     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1317     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1318     movflt(dst, one);
1319     jcc(Assembler::above, DONE_LABEL);
1320     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1321   } else if (opcode == Op_SignumD) {
1322     assert(UseSSE > 1, "required");
1323     ucomisd(dst, zero);
1324     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1325     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1326     movdbl(dst, one);
1327     jcc(Assembler::above, DONE_LABEL);
1328     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1329   }
1330 
1331   bind(DONE_LABEL);
1332 }
1333 
1334 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1335   if (sign) {
1336     pmovsxbw(dst, src);
1337   } else {
1338     pmovzxbw(dst, src);
1339   }
1340 }
1341 
1342 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1343   if (sign) {
1344     vpmovsxbw(dst, src, vector_len);
1345   } else {
1346     vpmovzxbw(dst, src, vector_len);
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1351   if (sign) {
1352     vpmovsxbd(dst, src, vector_len);
1353   } else {
1354     vpmovzxbd(dst, src, vector_len);
1355   }
1356 }
1357 
1358 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1359   if (sign) {
1360     vpmovsxwd(dst, src, vector_len);
1361   } else {
1362     vpmovzxwd(dst, src, vector_len);
1363   }
1364 }
1365 
1366 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1367                                      int shift, int vector_len) {
1368   if (opcode == Op_RotateLeftV) {
1369     if (etype == T_INT) {
1370       evprold(dst, src, shift, vector_len);
1371     } else {
1372       assert(etype == T_LONG, "expected type T_LONG");
1373       evprolq(dst, src, shift, vector_len);
1374     }
1375   } else {
1376     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1377     if (etype == T_INT) {
1378       evprord(dst, src, shift, vector_len);
1379     } else {
1380       assert(etype == T_LONG, "expected type T_LONG");
1381       evprorq(dst, src, shift, vector_len);
1382     }
1383   }
1384 }
1385 
1386 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1387                                      XMMRegister shift, int vector_len) {
1388   if (opcode == Op_RotateLeftV) {
1389     if (etype == T_INT) {
1390       evprolvd(dst, src, shift, vector_len);
1391     } else {
1392       assert(etype == T_LONG, "expected type T_LONG");
1393       evprolvq(dst, src, shift, vector_len);
1394     }
1395   } else {
1396     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1397     if (etype == T_INT) {
1398       evprorvd(dst, src, shift, vector_len);
1399     } else {
1400       assert(etype == T_LONG, "expected type T_LONG");
1401       evprorvq(dst, src, shift, vector_len);
1402     }
1403   }
1404 }
1405 
1406 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1407   if (opcode == Op_RShiftVI) {
1408     psrad(dst, shift);
1409   } else if (opcode == Op_LShiftVI) {
1410     pslld(dst, shift);
1411   } else {
1412     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1413     psrld(dst, shift);
1414   }
1415 }
1416 
1417 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1418   switch (opcode) {
1419     case Op_RShiftVI:  psrad(dst, shift); break;
1420     case Op_LShiftVI:  pslld(dst, shift); break;
1421     case Op_URShiftVI: psrld(dst, shift); break;
1422 
1423     default: assert(false, "%s", NodeClassNames[opcode]);
1424   }
1425 }
1426 
1427 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1428   if (opcode == Op_RShiftVI) {
1429     vpsrad(dst, nds, shift, vector_len);
1430   } else if (opcode == Op_LShiftVI) {
1431     vpslld(dst, nds, shift, vector_len);
1432   } else {
1433     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1434     vpsrld(dst, nds, shift, vector_len);
1435   }
1436 }
1437 
1438 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1439   switch (opcode) {
1440     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1441     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1442     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1443 
1444     default: assert(false, "%s", NodeClassNames[opcode]);
1445   }
1446 }
1447 
1448 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1449   switch (opcode) {
1450     case Op_RShiftVB:  // fall-through
1451     case Op_RShiftVS:  psraw(dst, shift); break;
1452 
1453     case Op_LShiftVB:  // fall-through
1454     case Op_LShiftVS:  psllw(dst, shift);   break;
1455 
1456     case Op_URShiftVS: // fall-through
1457     case Op_URShiftVB: psrlw(dst, shift);  break;
1458 
1459     default: assert(false, "%s", NodeClassNames[opcode]);
1460   }
1461 }
1462 
1463 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1464   switch (opcode) {
1465     case Op_RShiftVB:  // fall-through
1466     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1467 
1468     case Op_LShiftVB:  // fall-through
1469     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1470 
1471     case Op_URShiftVS: // fall-through
1472     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1473 
1474     default: assert(false, "%s", NodeClassNames[opcode]);
1475   }
1476 }
1477 
1478 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1479   switch (opcode) {
1480     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1481     case Op_LShiftVL:  psllq(dst, shift); break;
1482     case Op_URShiftVL: psrlq(dst, shift); break;
1483 
1484     default: assert(false, "%s", NodeClassNames[opcode]);
1485   }
1486 }
1487 
1488 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1489   if (opcode == Op_RShiftVL) {
1490     psrlq(dst, shift);  // using srl to implement sra on pre-avs512 systems
1491   } else if (opcode == Op_LShiftVL) {
1492     psllq(dst, shift);
1493   } else {
1494     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1495     psrlq(dst, shift);
1496   }
1497 }
1498 
1499 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1500   switch (opcode) {
1501     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1502     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1503     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1504 
1505     default: assert(false, "%s", NodeClassNames[opcode]);
1506   }
1507 }
1508 
1509 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1510   if (opcode == Op_RShiftVL) {
1511     evpsraq(dst, nds, shift, vector_len);
1512   } else if (opcode == Op_LShiftVL) {
1513     vpsllq(dst, nds, shift, vector_len);
1514   } else {
1515     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1516     vpsrlq(dst, nds, shift, vector_len);
1517   }
1518 }
1519 
1520 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1521   switch (opcode) {
1522     case Op_RShiftVB:  // fall-through
1523     case Op_RShiftVS:  // fall-through
1524     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1525 
1526     case Op_LShiftVB:  // fall-through
1527     case Op_LShiftVS:  // fall-through
1528     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1529 
1530     case Op_URShiftVB: // fall-through
1531     case Op_URShiftVS: // fall-through
1532     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1533 
1534     default: assert(false, "%s", NodeClassNames[opcode]);
1535   }
1536 }
1537 
1538 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1539   switch (opcode) {
1540     case Op_RShiftVB:  // fall-through
1541     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1542 
1543     case Op_LShiftVB:  // fall-through
1544     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1545 
1546     case Op_URShiftVB: // fall-through
1547     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1548 
1549     default: assert(false, "%s", NodeClassNames[opcode]);
1550   }
1551 }
1552 
1553 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1554   assert(UseAVX >= 2, "required");
1555   switch (opcode) {
1556     case Op_RShiftVL: {
1557       if (UseAVX > 2) {
1558         assert(tmp == xnoreg, "not used");
1559         if (!VM_Version::supports_avx512vl()) {
1560           vlen_enc = Assembler::AVX_512bit;
1561         }
1562         evpsravq(dst, src, shift, vlen_enc);
1563       } else {
1564         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1565         vpsrlvq(dst, src, shift, vlen_enc);
1566         vpsrlvq(tmp, tmp, shift, vlen_enc);
1567         vpxor(dst, dst, tmp, vlen_enc);
1568         vpsubq(dst, dst, tmp, vlen_enc);
1569       }
1570       break;
1571     }
1572     case Op_LShiftVL: {
1573       assert(tmp == xnoreg, "not used");
1574       vpsllvq(dst, src, shift, vlen_enc);
1575       break;
1576     }
1577     case Op_URShiftVL: {
1578       assert(tmp == xnoreg, "not used");
1579       vpsrlvq(dst, src, shift, vlen_enc);
1580       break;
1581     }
1582     default: assert(false, "%s", NodeClassNames[opcode]);
1583   }
1584 }
1585 
1586 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1587 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1588   assert(opcode == Op_LShiftVB ||
1589          opcode == Op_RShiftVB ||
1590          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1591   bool sign = (opcode != Op_URShiftVB);
1592   assert(vector_len == 0, "required");
1593   vextendbd(sign, dst, src, 1);
1594   vpmovzxbd(vtmp, shift, 1);
1595   varshiftd(opcode, dst, dst, vtmp, 1);
1596   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1597   vextracti128_high(vtmp, dst);
1598   vpackusdw(dst, dst, vtmp, 0);
1599 }
1600 
1601 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1602 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1603   assert(opcode == Op_LShiftVB ||
1604          opcode == Op_RShiftVB ||
1605          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1606   bool sign = (opcode != Op_URShiftVB);
1607   int ext_vector_len = vector_len + 1;
1608   vextendbw(sign, dst, src, ext_vector_len);
1609   vpmovzxbw(vtmp, shift, ext_vector_len);
1610   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1611   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1612   if (vector_len == 0) {
1613     vextracti128_high(vtmp, dst);
1614     vpackuswb(dst, dst, vtmp, vector_len);
1615   } else {
1616     vextracti64x4_high(vtmp, dst);
1617     vpackuswb(dst, dst, vtmp, vector_len);
1618     vpermq(dst, dst, 0xD8, vector_len);
1619   }
1620 }
1621 
1622 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1623   switch(typ) {
1624     case T_BYTE:
1625       pinsrb(dst, val, idx);
1626       break;
1627     case T_SHORT:
1628       pinsrw(dst, val, idx);
1629       break;
1630     case T_INT:
1631       pinsrd(dst, val, idx);
1632       break;
1633     case T_LONG:
1634       pinsrq(dst, val, idx);
1635       break;
1636     default:
1637       assert(false,"Should not reach here.");
1638       break;
1639   }
1640 }
1641 
1642 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1643   switch(typ) {
1644     case T_BYTE:
1645       vpinsrb(dst, src, val, idx);
1646       break;
1647     case T_SHORT:
1648       vpinsrw(dst, src, val, idx);
1649       break;
1650     case T_INT:
1651       vpinsrd(dst, src, val, idx);
1652       break;
1653     case T_LONG:
1654       vpinsrq(dst, src, val, idx);
1655       break;
1656     default:
1657       assert(false,"Should not reach here.");
1658       break;
1659   }
1660 }
1661 
1662 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1663   switch(typ) {
1664     case T_INT:
1665       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1666       break;
1667     case T_FLOAT:
1668       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1669       break;
1670     case T_LONG:
1671       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1672       break;
1673     case T_DOUBLE:
1674       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1675       break;
1676     default:
1677       assert(false,"Should not reach here.");
1678       break;
1679   }
1680 }
1681 
1682 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1683   switch(typ) {
1684     case T_INT:
1685       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1686       break;
1687     case T_FLOAT:
1688       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1689       break;
1690     case T_LONG:
1691       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1692       break;
1693     case T_DOUBLE:
1694       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1695       break;
1696     default:
1697       assert(false,"Should not reach here.");
1698       break;
1699   }
1700 }
1701 
1702 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1703   switch(typ) {
1704     case T_INT:
1705       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1706       break;
1707     case T_FLOAT:
1708       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1709       break;
1710     case T_LONG:
1711       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1712       break;
1713     case T_DOUBLE:
1714       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1715       break;
1716     default:
1717       assert(false,"Should not reach here.");
1718       break;
1719   }
1720 }
1721 
1722 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1723   if (vlen_in_bytes <= 16) {
1724     pxor (dst, dst);
1725     psubb(dst, src);
1726     switch (elem_bt) {
1727       case T_BYTE:   /* nothing to do */ break;
1728       case T_SHORT:  pmovsxbw(dst, dst); break;
1729       case T_INT:    pmovsxbd(dst, dst); break;
1730       case T_FLOAT:  pmovsxbd(dst, dst); break;
1731       case T_LONG:   pmovsxbq(dst, dst); break;
1732       case T_DOUBLE: pmovsxbq(dst, dst); break;
1733 
1734       default: assert(false, "%s", type2name(elem_bt));
1735     }
1736   } else {
1737     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1738     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1739 
1740     vpxor (dst, dst, dst, vlen_enc);
1741     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1742 
1743     switch (elem_bt) {
1744       case T_BYTE:   /* nothing to do */            break;
1745       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1746       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1747       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1748       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1749       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1750 
1751       default: assert(false, "%s", type2name(elem_bt));
1752     }
1753   }
1754 }
1755 
1756 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1757   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1758   if (vlen_in_bytes == 4) {
1759     movdl(dst, addr);
1760   } else if (vlen_in_bytes == 8) {
1761     movq(dst, addr);
1762   } else if (vlen_in_bytes == 16) {
1763     movdqu(dst, addr, scratch);
1764   } else if (vlen_in_bytes == 32) {
1765     vmovdqu(dst, addr, scratch);
1766   } else {
1767     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1768     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1769   }
1770 }
1771 
1772 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1773 
1774 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1775   int vector_len = Assembler::AVX_128bit;
1776 
1777   switch (opcode) {
1778     case Op_AndReductionV:  pand(dst, src); break;
1779     case Op_OrReductionV:   por (dst, src); break;
1780     case Op_XorReductionV:  pxor(dst, src); break;
1781     case Op_MinReductionV:
1782       switch (typ) {
1783         case T_BYTE:        pminsb(dst, src); break;
1784         case T_SHORT:       pminsw(dst, src); break;
1785         case T_INT:         pminsd(dst, src); break;
1786         case T_LONG:        assert(UseAVX > 2, "required");
1787                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_MaxReductionV:
1792       switch (typ) {
1793         case T_BYTE:        pmaxsb(dst, src); break;
1794         case T_SHORT:       pmaxsw(dst, src); break;
1795         case T_INT:         pmaxsd(dst, src); break;
1796         case T_LONG:        assert(UseAVX > 2, "required");
1797                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1798         default:            assert(false, "wrong type");
1799       }
1800       break;
1801     case Op_AddReductionVF: addss(dst, src); break;
1802     case Op_AddReductionVD: addsd(dst, src); break;
1803     case Op_AddReductionVI:
1804       switch (typ) {
1805         case T_BYTE:        paddb(dst, src); break;
1806         case T_SHORT:       paddw(dst, src); break;
1807         case T_INT:         paddd(dst, src); break;
1808         default:            assert(false, "wrong type");
1809       }
1810       break;
1811     case Op_AddReductionVL: paddq(dst, src); break;
1812     case Op_MulReductionVF: mulss(dst, src); break;
1813     case Op_MulReductionVD: mulsd(dst, src); break;
1814     case Op_MulReductionVI:
1815       switch (typ) {
1816         case T_SHORT:       pmullw(dst, src); break;
1817         case T_INT:         pmulld(dst, src); break;
1818         default:            assert(false, "wrong type");
1819       }
1820       break;
1821     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1822                             vpmullq(dst, dst, src, vector_len); break;
1823     default:                assert(false, "wrong opcode");
1824   }
1825 }
1826 
1827 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1828   int vector_len = Assembler::AVX_256bit;
1829 
1830   switch (opcode) {
1831     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1832     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1833     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1834     case Op_MinReductionV:
1835       switch (typ) {
1836         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1837         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1838         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1839         case T_LONG:        assert(UseAVX > 2, "required");
1840                             vpminsq(dst, src1, src2, vector_len); break;
1841         default:            assert(false, "wrong type");
1842       }
1843       break;
1844     case Op_MaxReductionV:
1845       switch (typ) {
1846         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1847         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1848         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1849         case T_LONG:        assert(UseAVX > 2, "required");
1850                             vpmaxsq(dst, src1, src2, vector_len); break;
1851         default:            assert(false, "wrong type");
1852       }
1853       break;
1854     case Op_AddReductionVI:
1855       switch (typ) {
1856         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1857         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1858         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1859         default:            assert(false, "wrong type");
1860       }
1861       break;
1862     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1863     case Op_MulReductionVI:
1864       switch (typ) {
1865         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1866         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1867         default:            assert(false, "wrong type");
1868       }
1869       break;
1870     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1871     default:                assert(false, "wrong opcode");
1872   }
1873 }
1874 
1875 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1876                                   XMMRegister dst, XMMRegister src,
1877                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1878   switch (opcode) {
1879     case Op_AddReductionVF:
1880     case Op_MulReductionVF:
1881       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1882       break;
1883 
1884     case Op_AddReductionVD:
1885     case Op_MulReductionVD:
1886       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1887       break;
1888 
1889     default: assert(false, "wrong opcode");
1890   }
1891 }
1892 
1893 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1894                              Register dst, Register src1, XMMRegister src2,
1895                              XMMRegister vtmp1, XMMRegister vtmp2) {
1896   switch (vlen) {
1897     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1900     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901 
1902     default: assert(false, "wrong vector length");
1903   }
1904 }
1905 
1906 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1907                              Register dst, Register src1, XMMRegister src2,
1908                              XMMRegister vtmp1, XMMRegister vtmp2) {
1909   switch (vlen) {
1910     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1912     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1913     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914 
1915     default: assert(false, "wrong vector length");
1916   }
1917 }
1918 
1919 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1920                              Register dst, Register src1, XMMRegister src2,
1921                              XMMRegister vtmp1, XMMRegister vtmp2) {
1922   switch (vlen) {
1923     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1924     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1926     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927 
1928     default: assert(false, "wrong vector length");
1929   }
1930 }
1931 
1932 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1933                              Register dst, Register src1, XMMRegister src2,
1934                              XMMRegister vtmp1, XMMRegister vtmp2) {
1935   switch (vlen) {
1936     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1937     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1938     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1939     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1940 
1941     default: assert(false, "wrong vector length");
1942   }
1943 }
1944 
1945 #ifdef _LP64
1946 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1947                              Register dst, Register src1, XMMRegister src2,
1948                              XMMRegister vtmp1, XMMRegister vtmp2) {
1949   switch (vlen) {
1950     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1953 
1954     default: assert(false, "wrong vector length");
1955   }
1956 }
1957 #endif // _LP64
1958 
1959 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1960   switch (vlen) {
1961     case 2:
1962       assert(vtmp2 == xnoreg, "");
1963       reduce2F(opcode, dst, src, vtmp1);
1964       break;
1965     case 4:
1966       assert(vtmp2 == xnoreg, "");
1967       reduce4F(opcode, dst, src, vtmp1);
1968       break;
1969     case 8:
1970       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1971       break;
1972     case 16:
1973       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1974       break;
1975     default: assert(false, "wrong vector length");
1976   }
1977 }
1978 
1979 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980   switch (vlen) {
1981     case 2:
1982       assert(vtmp2 == xnoreg, "");
1983       reduce2D(opcode, dst, src, vtmp1);
1984       break;
1985     case 4:
1986       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1987       break;
1988     case 8:
1989       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1990       break;
1991     default: assert(false, "wrong vector length");
1992   }
1993 }
1994 
1995 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1996   if (opcode == Op_AddReductionVI) {
1997     if (vtmp1 != src2) {
1998       movdqu(vtmp1, src2);
1999     }
2000     phaddd(vtmp1, vtmp1);
2001   } else {
2002     pshufd(vtmp1, src2, 0x1);
2003     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2004   }
2005   movdl(vtmp2, src1);
2006   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2007   movdl(dst, vtmp1);
2008 }
2009 
2010 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2011   if (opcode == Op_AddReductionVI) {
2012     if (vtmp1 != src2) {
2013       movdqu(vtmp1, src2);
2014     }
2015     phaddd(vtmp1, src2);
2016     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2017   } else {
2018     pshufd(vtmp2, src2, 0xE);
2019     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2020     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2021   }
2022 }
2023 
2024 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2025   if (opcode == Op_AddReductionVI) {
2026     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2027     vextracti128_high(vtmp2, vtmp1);
2028     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2029     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2030   } else {
2031     vextracti128_high(vtmp1, src2);
2032     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2033     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2034   }
2035 }
2036 
2037 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2038   vextracti64x4_high(vtmp2, src2);
2039   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2040   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2041 }
2042 
2043 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2044   pshufd(vtmp2, src2, 0x1);
2045   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2046   movdqu(vtmp1, vtmp2);
2047   psrldq(vtmp1, 2);
2048   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2049   movdqu(vtmp2, vtmp1);
2050   psrldq(vtmp2, 1);
2051   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2052   movdl(vtmp2, src1);
2053   pmovsxbd(vtmp1, vtmp1);
2054   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2055   pextrb(dst, vtmp1, 0x0);
2056   movsbl(dst, dst);
2057 }
2058 
2059 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2060   pshufd(vtmp1, src2, 0xE);
2061   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2062   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2063 }
2064 
2065 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   vextracti128_high(vtmp2, src2);
2067   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2068   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2069 }
2070 
2071 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2072   vextracti64x4_high(vtmp1, src2);
2073   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2074   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2075 }
2076 
2077 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2078   pmovsxbw(vtmp2, src2);
2079   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2080 }
2081 
2082 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2083   if (UseAVX > 1) {
2084     int vector_len = Assembler::AVX_256bit;
2085     vpmovsxbw(vtmp1, src2, vector_len);
2086     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2087   } else {
2088     pmovsxbw(vtmp2, src2);
2089     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2090     pshufd(vtmp2, src2, 0x1);
2091     pmovsxbw(vtmp2, src2);
2092     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2093   }
2094 }
2095 
2096 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2097   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2098     int vector_len = Assembler::AVX_512bit;
2099     vpmovsxbw(vtmp1, src2, vector_len);
2100     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2101   } else {
2102     assert(UseAVX >= 2,"Should not reach here.");
2103     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2104     vextracti128_high(vtmp2, src2);
2105     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2106   }
2107 }
2108 
2109 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2110   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2111   vextracti64x4_high(vtmp2, src2);
2112   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2113 }
2114 
2115 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2116   if (opcode == Op_AddReductionVI) {
2117     if (vtmp1 != src2) {
2118       movdqu(vtmp1, src2);
2119     }
2120     phaddw(vtmp1, vtmp1);
2121     phaddw(vtmp1, vtmp1);
2122   } else {
2123     pshufd(vtmp2, src2, 0x1);
2124     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2125     movdqu(vtmp1, vtmp2);
2126     psrldq(vtmp1, 2);
2127     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2128   }
2129   movdl(vtmp2, src1);
2130   pmovsxwd(vtmp1, vtmp1);
2131   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2132   pextrw(dst, vtmp1, 0x0);
2133   movswl(dst, dst);
2134 }
2135 
2136 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2137   if (opcode == Op_AddReductionVI) {
2138     if (vtmp1 != src2) {
2139       movdqu(vtmp1, src2);
2140     }
2141     phaddw(vtmp1, src2);
2142   } else {
2143     pshufd(vtmp1, src2, 0xE);
2144     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2145   }
2146   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2147 }
2148 
2149 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2150   if (opcode == Op_AddReductionVI) {
2151     int vector_len = Assembler::AVX_256bit;
2152     vphaddw(vtmp2, src2, src2, vector_len);
2153     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2154   } else {
2155     vextracti128_high(vtmp2, src2);
2156     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2157   }
2158   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2159 }
2160 
2161 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2162   int vector_len = Assembler::AVX_256bit;
2163   vextracti64x4_high(vtmp1, src2);
2164   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2165   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2166 }
2167 
2168 #ifdef _LP64
2169 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2170   pshufd(vtmp2, src2, 0xE);
2171   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2172   movdq(vtmp1, src1);
2173   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2174   movdq(dst, vtmp1);
2175 }
2176 
2177 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2178   vextracti128_high(vtmp1, src2);
2179   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2180   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2181 }
2182 
2183 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2184   vextracti64x4_high(vtmp2, src2);
2185   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2186   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2187 }
2188 
2189 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2190   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
2191   mov64(temp, -1L);
2192   bzhiq(temp, temp, len);
2193   kmovql(dst, temp);
2194 }
2195 #endif // _LP64
2196 
2197 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2198   reduce_operation_128(T_FLOAT, opcode, dst, src);
2199   pshufd(vtmp, src, 0x1);
2200   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2201 }
2202 
2203 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2204   reduce2F(opcode, dst, src, vtmp);
2205   pshufd(vtmp, src, 0x2);
2206   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2207   pshufd(vtmp, src, 0x3);
2208   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2209 }
2210 
2211 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2212   reduce4F(opcode, dst, src, vtmp2);
2213   vextractf128_high(vtmp2, src);
2214   reduce4F(opcode, dst, vtmp2, vtmp1);
2215 }
2216 
2217 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2218   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2219   vextracti64x4_high(vtmp1, src);
2220   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2221 }
2222 
2223 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2224   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2225   pshufd(vtmp, src, 0xE);
2226   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2227 }
2228 
2229 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2230   reduce2D(opcode, dst, src, vtmp2);
2231   vextractf128_high(vtmp2, src);
2232   reduce2D(opcode, dst, vtmp2, vtmp1);
2233 }
2234 
2235 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2236   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2237   vextracti64x4_high(vtmp1, src);
2238   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2239 }
2240 
2241 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
2242   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2243 }
2244 
2245 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
2246   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2247 }
2248 
2249 
2250 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2251                                           XMMRegister dst, XMMRegister src,
2252                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2253                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2254   int permconst[] = {1, 14};
2255   XMMRegister wsrc = src;
2256   XMMRegister wdst = xmm_0;
2257   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2258 
2259   int vlen_enc = Assembler::AVX_128bit;
2260   if (vlen == 16) {
2261     vlen_enc = Assembler::AVX_256bit;
2262   }
2263 
2264   for (int i = log2(vlen) - 1; i >=0; i--) {
2265     if (i == 0 && !is_dst_valid) {
2266       wdst = dst;
2267     }
2268     if (i == 3) {
2269       vextracti64x4_high(wtmp, wsrc);
2270     } else if (i == 2) {
2271       vextracti128_high(wtmp, wsrc);
2272     } else { // i = [0,1]
2273       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2274     }
2275     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2276     wsrc = wdst;
2277     vlen_enc = Assembler::AVX_128bit;
2278   }
2279   if (is_dst_valid) {
2280     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2281   }
2282 }
2283 
2284 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2285                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2286                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2287   XMMRegister wsrc = src;
2288   XMMRegister wdst = xmm_0;
2289   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2290   int vlen_enc = Assembler::AVX_128bit;
2291   if (vlen == 8) {
2292     vlen_enc = Assembler::AVX_256bit;
2293   }
2294   for (int i = log2(vlen) - 1; i >=0; i--) {
2295     if (i == 0 && !is_dst_valid) {
2296       wdst = dst;
2297     }
2298     if (i == 1) {
2299       vextracti128_high(wtmp, wsrc);
2300     } else if (i == 2) {
2301       vextracti64x4_high(wtmp, wsrc);
2302     } else {
2303       assert(i == 0, "%d", i);
2304       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2305     }
2306     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2307     wsrc = wdst;
2308     vlen_enc = Assembler::AVX_128bit;
2309   }
2310   if (is_dst_valid) {
2311     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2312   }
2313 }
2314 
2315 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2316   switch (bt) {
2317     case T_BYTE:  pextrb(dst, src, idx); break;
2318     case T_SHORT: pextrw(dst, src, idx); break;
2319     case T_INT:   pextrd(dst, src, idx); break;
2320     case T_LONG:  pextrq(dst, src, idx); break;
2321 
2322     default:
2323       assert(false,"Should not reach here.");
2324       break;
2325   }
2326 }
2327 
2328 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2329   int esize =  type2aelembytes(typ);
2330   int elem_per_lane = 16/esize;
2331   int lane = elemindex / elem_per_lane;
2332   int eindex = elemindex % elem_per_lane;
2333 
2334   if (lane >= 2) {
2335     assert(UseAVX > 2, "required");
2336     vextractf32x4(dst, src, lane & 3);
2337     return dst;
2338   } else if (lane > 0) {
2339     assert(UseAVX > 0, "required");
2340     vextractf128(dst, src, lane);
2341     return dst;
2342   } else {
2343     return src;
2344   }
2345 }
2346 
2347 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2348   if (typ == T_BYTE) {
2349     movsbl(dst, dst);
2350   } else if (typ == T_SHORT) {
2351     movswl(dst, dst);
2352   }
2353 }
2354 
2355 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2356   int esize =  type2aelembytes(typ);
2357   int elem_per_lane = 16/esize;
2358   int eindex = elemindex % elem_per_lane;
2359   assert(is_integral_type(typ),"required");
2360 
2361   if (eindex == 0) {
2362     if (typ == T_LONG) {
2363       movq(dst, src);
2364     } else {
2365       movdl(dst, src);
2366       movsxl(typ, dst);
2367     }
2368   } else {
2369     extract(typ, dst, src, eindex);
2370     movsxl(typ, dst);
2371   }
2372 }
2373 
2374 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2375   int esize =  type2aelembytes(typ);
2376   int elem_per_lane = 16/esize;
2377   int eindex = elemindex % elem_per_lane;
2378   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2379 
2380   if (eindex == 0) {
2381     movq(dst, src);
2382   } else {
2383     if (typ == T_FLOAT) {
2384       if (UseAVX == 0) {
2385         movdqu(dst, src);
2386         pshufps(dst, dst, eindex);
2387       } else {
2388         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2389       }
2390     } else {
2391       if (UseAVX == 0) {
2392         movdqu(dst, src);
2393         psrldq(dst, eindex*esize);
2394       } else {
2395         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2396       }
2397       movq(dst, dst);
2398     }
2399   }
2400   // Zero upper bits
2401   if (typ == T_FLOAT) {
2402     if (UseAVX == 0) {
2403       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2404       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2405       pand(dst, vtmp);
2406     } else {
2407       assert((tmp != noreg), "required.");
2408       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2409     }
2410   }
2411 }
2412 
2413 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2414   switch(typ) {
2415     case T_BYTE:
2416     case T_BOOLEAN:
2417       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2418       break;
2419     case T_SHORT:
2420     case T_CHAR:
2421       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2422       break;
2423     case T_INT:
2424     case T_FLOAT:
2425       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2426       break;
2427     case T_LONG:
2428     case T_DOUBLE:
2429       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2430       break;
2431     default:
2432       assert(false,"Should not reach here.");
2433       break;
2434   }
2435 }
2436 
2437 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2438   switch(typ) {
2439     case T_BOOLEAN:
2440     case T_BYTE:
2441       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2442       break;
2443     case T_CHAR:
2444     case T_SHORT:
2445       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2446       break;
2447     case T_INT:
2448     case T_FLOAT:
2449       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2450       break;
2451     case T_LONG:
2452     case T_DOUBLE:
2453       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2454       break;
2455     default:
2456       assert(false,"Should not reach here.");
2457       break;
2458   }
2459 }
2460 
2461 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2462                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2463   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2464   switch (typ) {
2465   case T_BYTE:
2466     vpmovzxbw(vtmp1, src1, vlen_enc);
2467     vpmovzxbw(vtmp2, src2, vlen_enc);
2468     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2469     vpacksswb(dst, dst, dst, vlen_enc);
2470     break;
2471   case T_SHORT:
2472     vpmovzxwd(vtmp1, src1, vlen_enc);
2473     vpmovzxwd(vtmp2, src2, vlen_enc);
2474     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2475     vpackssdw(dst, dst, dst, vlen_enc);
2476     break;
2477   case T_INT:
2478     vpmovzxdq(vtmp1, src1, vlen_enc);
2479     vpmovzxdq(vtmp2, src2, vlen_enc);
2480     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2481     vpermilps(dst, dst, 8, vlen_enc);
2482     break;
2483   default:
2484     assert(false, "Should not reach here");
2485   }
2486   if (vlen_in_bytes == 16) {
2487     vpermpd(dst, dst, 0x8, vlen_enc);
2488   }
2489 }
2490 
2491 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2492                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2493   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2494   switch (typ) {
2495   case T_BYTE:
2496     vpmovzxbw(vtmp1, src1, vlen_enc);
2497     vpmovzxbw(vtmp2, src2, vlen_enc);
2498     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2499     vextracti128(vtmp1, src1, 1);
2500     vextracti128(vtmp2, src2, 1);
2501     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2502     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2503     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2504     vpacksswb(dst, dst, vtmp3, vlen_enc);
2505     vpermpd(dst, dst, 0xd8, vlen_enc);
2506     break;
2507   case T_SHORT:
2508     vpmovzxwd(vtmp1, src1, vlen_enc);
2509     vpmovzxwd(vtmp2, src2, vlen_enc);
2510     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2511     vextracti128(vtmp1, src1, 1);
2512     vextracti128(vtmp2, src2, 1);
2513     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2514     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2515     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2516     vpackssdw(dst, dst, vtmp3, vlen_enc);
2517     vpermpd(dst, dst, 0xd8, vlen_enc);
2518     break;
2519   case T_INT:
2520     vpmovzxdq(vtmp1, src1, vlen_enc);
2521     vpmovzxdq(vtmp2, src2, vlen_enc);
2522     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2523     vpshufd(dst, dst, 8, vlen_enc);
2524     vpermq(dst, dst, 8, vlen_enc);
2525     vextracti128(vtmp1, src1, 1);
2526     vextracti128(vtmp2, src2, 1);
2527     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2528     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2529     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2530     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2531     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2532     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2533     break;
2534   default:
2535     assert(false, "Should not reach here");
2536   }
2537 }
2538 
2539 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2540   switch(typ) {
2541     case T_BYTE:
2542       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2543       break;
2544     case T_SHORT:
2545       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2546       break;
2547     case T_INT:
2548     case T_FLOAT:
2549       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2550       break;
2551     case T_LONG:
2552     case T_DOUBLE:
2553       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2554       break;
2555     default:
2556       assert(false,"Should not reach here.");
2557       break;
2558   }
2559 }
2560 
2561 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2562                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2563   switch(vlen) {
2564     case 4:
2565       assert(vtmp1 != xnoreg, "required.");
2566       // Broadcast lower 32 bits to 128 bits before ptest
2567       pshufd(vtmp1, src1, 0x0);
2568       if (bt == BoolTest::overflow) {
2569         assert(vtmp2 != xnoreg, "required.");
2570         pshufd(vtmp2, src2, 0x0);
2571       } else {
2572         assert(vtmp2 == xnoreg, "required.");
2573         vtmp2 = src2;
2574       }
2575       ptest(vtmp1, vtmp2);
2576      break;
2577     case 8:
2578       assert(vtmp1 != xnoreg, "required.");
2579       // Broadcast lower 64 bits to 128 bits before ptest
2580       pshufd(vtmp1, src1, 0x4);
2581       if (bt == BoolTest::overflow) {
2582         assert(vtmp2 != xnoreg, "required.");
2583         pshufd(vtmp2, src2, 0x4);
2584       } else {
2585         assert(vtmp2 == xnoreg, "required.");
2586         vtmp2 = src2;
2587       }
2588       ptest(vtmp1, vtmp2);
2589      break;
2590     case 16:
2591       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2592       ptest(src1, src2);
2593       break;
2594     case 32:
2595       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2596       vptest(src1, src2, Assembler::AVX_256bit);
2597       break;
2598     case 64:
2599       {
2600         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2601         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2602         if (bt == BoolTest::ne) {
2603           ktestql(mask, mask);
2604         } else {
2605           assert(bt == BoolTest::overflow, "required");
2606           kortestql(mask, mask);
2607         }
2608       }
2609       break;
2610     default:
2611       assert(false,"Should not reach here.");
2612       break;
2613   }
2614 }
2615 
2616 //-------------------------------------------------------------------------------------------
2617 
2618 // IndexOf for constant substrings with size >= 8 chars
2619 // which don't need to be loaded through stack.
2620 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2621                                          Register cnt1, Register cnt2,
2622                                          int int_cnt2,  Register result,
2623                                          XMMRegister vec, Register tmp,
2624                                          int ae) {
2625   ShortBranchVerifier sbv(this);
2626   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2627   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2628 
2629   // This method uses the pcmpestri instruction with bound registers
2630   //   inputs:
2631   //     xmm - substring
2632   //     rax - substring length (elements count)
2633   //     mem - scanned string
2634   //     rdx - string length (elements count)
2635   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2636   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2637   //   outputs:
2638   //     rcx - matched index in string
2639   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2640   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2641   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2642   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2643   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2644 
2645   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2646         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2647         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2648 
2649   // Note, inline_string_indexOf() generates checks:
2650   // if (substr.count > string.count) return -1;
2651   // if (substr.count == 0) return 0;
2652   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2653 
2654   // Load substring.
2655   if (ae == StrIntrinsicNode::UL) {
2656     pmovzxbw(vec, Address(str2, 0));
2657   } else {
2658     movdqu(vec, Address(str2, 0));
2659   }
2660   movl(cnt2, int_cnt2);
2661   movptr(result, str1); // string addr
2662 
2663   if (int_cnt2 > stride) {
2664     jmpb(SCAN_TO_SUBSTR);
2665 
2666     // Reload substr for rescan, this code
2667     // is executed only for large substrings (> 8 chars)
2668     bind(RELOAD_SUBSTR);
2669     if (ae == StrIntrinsicNode::UL) {
2670       pmovzxbw(vec, Address(str2, 0));
2671     } else {
2672       movdqu(vec, Address(str2, 0));
2673     }
2674     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2675 
2676     bind(RELOAD_STR);
2677     // We came here after the beginning of the substring was
2678     // matched but the rest of it was not so we need to search
2679     // again. Start from the next element after the previous match.
2680 
2681     // cnt2 is number of substring reminding elements and
2682     // cnt1 is number of string reminding elements when cmp failed.
2683     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2684     subl(cnt1, cnt2);
2685     addl(cnt1, int_cnt2);
2686     movl(cnt2, int_cnt2); // Now restore cnt2
2687 
2688     decrementl(cnt1);     // Shift to next element
2689     cmpl(cnt1, cnt2);
2690     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2691 
2692     addptr(result, (1<<scale1));
2693 
2694   } // (int_cnt2 > 8)
2695 
2696   // Scan string for start of substr in 16-byte vectors
2697   bind(SCAN_TO_SUBSTR);
2698   pcmpestri(vec, Address(result, 0), mode);
2699   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2700   subl(cnt1, stride);
2701   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2702   cmpl(cnt1, cnt2);
2703   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2704   addptr(result, 16);
2705   jmpb(SCAN_TO_SUBSTR);
2706 
2707   // Found a potential substr
2708   bind(FOUND_CANDIDATE);
2709   // Matched whole vector if first element matched (tmp(rcx) == 0).
2710   if (int_cnt2 == stride) {
2711     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2712   } else { // int_cnt2 > 8
2713     jccb(Assembler::overflow, FOUND_SUBSTR);
2714   }
2715   // After pcmpestri tmp(rcx) contains matched element index
2716   // Compute start addr of substr
2717   lea(result, Address(result, tmp, scale1));
2718 
2719   // Make sure string is still long enough
2720   subl(cnt1, tmp);
2721   cmpl(cnt1, cnt2);
2722   if (int_cnt2 == stride) {
2723     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2724   } else { // int_cnt2 > 8
2725     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2726   }
2727   // Left less then substring.
2728 
2729   bind(RET_NOT_FOUND);
2730   movl(result, -1);
2731   jmp(EXIT);
2732 
2733   if (int_cnt2 > stride) {
2734     // This code is optimized for the case when whole substring
2735     // is matched if its head is matched.
2736     bind(MATCH_SUBSTR_HEAD);
2737     pcmpestri(vec, Address(result, 0), mode);
2738     // Reload only string if does not match
2739     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2740 
2741     Label CONT_SCAN_SUBSTR;
2742     // Compare the rest of substring (> 8 chars).
2743     bind(FOUND_SUBSTR);
2744     // First 8 chars are already matched.
2745     negptr(cnt2);
2746     addptr(cnt2, stride);
2747 
2748     bind(SCAN_SUBSTR);
2749     subl(cnt1, stride);
2750     cmpl(cnt2, -stride); // Do not read beyond substring
2751     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2752     // Back-up strings to avoid reading beyond substring:
2753     // cnt1 = cnt1 - cnt2 + 8
2754     addl(cnt1, cnt2); // cnt2 is negative
2755     addl(cnt1, stride);
2756     movl(cnt2, stride); negptr(cnt2);
2757     bind(CONT_SCAN_SUBSTR);
2758     if (int_cnt2 < (int)G) {
2759       int tail_off1 = int_cnt2<<scale1;
2760       int tail_off2 = int_cnt2<<scale2;
2761       if (ae == StrIntrinsicNode::UL) {
2762         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2763       } else {
2764         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2765       }
2766       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2767     } else {
2768       // calculate index in register to avoid integer overflow (int_cnt2*2)
2769       movl(tmp, int_cnt2);
2770       addptr(tmp, cnt2);
2771       if (ae == StrIntrinsicNode::UL) {
2772         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2773       } else {
2774         movdqu(vec, Address(str2, tmp, scale2, 0));
2775       }
2776       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2777     }
2778     // Need to reload strings pointers if not matched whole vector
2779     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2780     addptr(cnt2, stride);
2781     jcc(Assembler::negative, SCAN_SUBSTR);
2782     // Fall through if found full substring
2783 
2784   } // (int_cnt2 > 8)
2785 
2786   bind(RET_FOUND);
2787   // Found result if we matched full small substring.
2788   // Compute substr offset
2789   subptr(result, str1);
2790   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2791     shrl(result, 1); // index
2792   }
2793   bind(EXIT);
2794 
2795 } // string_indexofC8
2796 
2797 // Small strings are loaded through stack if they cross page boundary.
2798 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2799                                        Register cnt1, Register cnt2,
2800                                        int int_cnt2,  Register result,
2801                                        XMMRegister vec, Register tmp,
2802                                        int ae) {
2803   ShortBranchVerifier sbv(this);
2804   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2805   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2806 
2807   //
2808   // int_cnt2 is length of small (< 8 chars) constant substring
2809   // or (-1) for non constant substring in which case its length
2810   // is in cnt2 register.
2811   //
2812   // Note, inline_string_indexOf() generates checks:
2813   // if (substr.count > string.count) return -1;
2814   // if (substr.count == 0) return 0;
2815   //
2816   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2817   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2818   // This method uses the pcmpestri instruction with bound registers
2819   //   inputs:
2820   //     xmm - substring
2821   //     rax - substring length (elements count)
2822   //     mem - scanned string
2823   //     rdx - string length (elements count)
2824   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2825   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2826   //   outputs:
2827   //     rcx - matched index in string
2828   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2829   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2830   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2831   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2832 
2833   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2834         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2835         FOUND_CANDIDATE;
2836 
2837   { //========================================================
2838     // We don't know where these strings are located
2839     // and we can't read beyond them. Load them through stack.
2840     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2841 
2842     movptr(tmp, rsp); // save old SP
2843 
2844     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2845       if (int_cnt2 == (1>>scale2)) { // One byte
2846         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2847         load_unsigned_byte(result, Address(str2, 0));
2848         movdl(vec, result); // move 32 bits
2849       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2850         // Not enough header space in 32-bit VM: 12+3 = 15.
2851         movl(result, Address(str2, -1));
2852         shrl(result, 8);
2853         movdl(vec, result); // move 32 bits
2854       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2855         load_unsigned_short(result, Address(str2, 0));
2856         movdl(vec, result); // move 32 bits
2857       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2858         movdl(vec, Address(str2, 0)); // move 32 bits
2859       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2860         movq(vec, Address(str2, 0));  // move 64 bits
2861       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2862         // Array header size is 12 bytes in 32-bit VM
2863         // + 6 bytes for 3 chars == 18 bytes,
2864         // enough space to load vec and shift.
2865         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2866         if (ae == StrIntrinsicNode::UL) {
2867           int tail_off = int_cnt2-8;
2868           pmovzxbw(vec, Address(str2, tail_off));
2869           psrldq(vec, -2*tail_off);
2870         }
2871         else {
2872           int tail_off = int_cnt2*(1<<scale2);
2873           movdqu(vec, Address(str2, tail_off-16));
2874           psrldq(vec, 16-tail_off);
2875         }
2876       }
2877     } else { // not constant substring
2878       cmpl(cnt2, stride);
2879       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2880 
2881       // We can read beyond string if srt+16 does not cross page boundary
2882       // since heaps are aligned and mapped by pages.
2883       assert(os::vm_page_size() < (int)G, "default page should be small");
2884       movl(result, str2); // We need only low 32 bits
2885       andl(result, (os::vm_page_size()-1));
2886       cmpl(result, (os::vm_page_size()-16));
2887       jccb(Assembler::belowEqual, CHECK_STR);
2888 
2889       // Move small strings to stack to allow load 16 bytes into vec.
2890       subptr(rsp, 16);
2891       int stk_offset = wordSize-(1<<scale2);
2892       push(cnt2);
2893 
2894       bind(COPY_SUBSTR);
2895       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2896         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2897         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2898       } else if (ae == StrIntrinsicNode::UU) {
2899         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2900         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2901       }
2902       decrement(cnt2);
2903       jccb(Assembler::notZero, COPY_SUBSTR);
2904 
2905       pop(cnt2);
2906       movptr(str2, rsp);  // New substring address
2907     } // non constant
2908 
2909     bind(CHECK_STR);
2910     cmpl(cnt1, stride);
2911     jccb(Assembler::aboveEqual, BIG_STRINGS);
2912 
2913     // Check cross page boundary.
2914     movl(result, str1); // We need only low 32 bits
2915     andl(result, (os::vm_page_size()-1));
2916     cmpl(result, (os::vm_page_size()-16));
2917     jccb(Assembler::belowEqual, BIG_STRINGS);
2918 
2919     subptr(rsp, 16);
2920     int stk_offset = -(1<<scale1);
2921     if (int_cnt2 < 0) { // not constant
2922       push(cnt2);
2923       stk_offset += wordSize;
2924     }
2925     movl(cnt2, cnt1);
2926 
2927     bind(COPY_STR);
2928     if (ae == StrIntrinsicNode::LL) {
2929       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2930       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2931     } else {
2932       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2933       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2934     }
2935     decrement(cnt2);
2936     jccb(Assembler::notZero, COPY_STR);
2937 
2938     if (int_cnt2 < 0) { // not constant
2939       pop(cnt2);
2940     }
2941     movptr(str1, rsp);  // New string address
2942 
2943     bind(BIG_STRINGS);
2944     // Load substring.
2945     if (int_cnt2 < 0) { // -1
2946       if (ae == StrIntrinsicNode::UL) {
2947         pmovzxbw(vec, Address(str2, 0));
2948       } else {
2949         movdqu(vec, Address(str2, 0));
2950       }
2951       push(cnt2);       // substr count
2952       push(str2);       // substr addr
2953       push(str1);       // string addr
2954     } else {
2955       // Small (< 8 chars) constant substrings are loaded already.
2956       movl(cnt2, int_cnt2);
2957     }
2958     push(tmp);  // original SP
2959 
2960   } // Finished loading
2961 
2962   //========================================================
2963   // Start search
2964   //
2965 
2966   movptr(result, str1); // string addr
2967 
2968   if (int_cnt2  < 0) {  // Only for non constant substring
2969     jmpb(SCAN_TO_SUBSTR);
2970 
2971     // SP saved at sp+0
2972     // String saved at sp+1*wordSize
2973     // Substr saved at sp+2*wordSize
2974     // Substr count saved at sp+3*wordSize
2975 
2976     // Reload substr for rescan, this code
2977     // is executed only for large substrings (> 8 chars)
2978     bind(RELOAD_SUBSTR);
2979     movptr(str2, Address(rsp, 2*wordSize));
2980     movl(cnt2, Address(rsp, 3*wordSize));
2981     if (ae == StrIntrinsicNode::UL) {
2982       pmovzxbw(vec, Address(str2, 0));
2983     } else {
2984       movdqu(vec, Address(str2, 0));
2985     }
2986     // We came here after the beginning of the substring was
2987     // matched but the rest of it was not so we need to search
2988     // again. Start from the next element after the previous match.
2989     subptr(str1, result); // Restore counter
2990     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2991       shrl(str1, 1);
2992     }
2993     addl(cnt1, str1);
2994     decrementl(cnt1);   // Shift to next element
2995     cmpl(cnt1, cnt2);
2996     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
2997 
2998     addptr(result, (1<<scale1));
2999   } // non constant
3000 
3001   // Scan string for start of substr in 16-byte vectors
3002   bind(SCAN_TO_SUBSTR);
3003   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3004   pcmpestri(vec, Address(result, 0), mode);
3005   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3006   subl(cnt1, stride);
3007   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3008   cmpl(cnt1, cnt2);
3009   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
3010   addptr(result, 16);
3011 
3012   bind(ADJUST_STR);
3013   cmpl(cnt1, stride); // Do not read beyond string
3014   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3015   // Back-up string to avoid reading beyond string.
3016   lea(result, Address(result, cnt1, scale1, -16));
3017   movl(cnt1, stride);
3018   jmpb(SCAN_TO_SUBSTR);
3019 
3020   // Found a potential substr
3021   bind(FOUND_CANDIDATE);
3022   // After pcmpestri tmp(rcx) contains matched element index
3023 
3024   // Make sure string is still long enough
3025   subl(cnt1, tmp);
3026   cmpl(cnt1, cnt2);
3027   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3028   // Left less then substring.
3029 
3030   bind(RET_NOT_FOUND);
3031   movl(result, -1);
3032   jmp(CLEANUP);
3033 
3034   bind(FOUND_SUBSTR);
3035   // Compute start addr of substr
3036   lea(result, Address(result, tmp, scale1));
3037   if (int_cnt2 > 0) { // Constant substring
3038     // Repeat search for small substring (< 8 chars)
3039     // from new point without reloading substring.
3040     // Have to check that we don't read beyond string.
3041     cmpl(tmp, stride-int_cnt2);
3042     jccb(Assembler::greater, ADJUST_STR);
3043     // Fall through if matched whole substring.
3044   } else { // non constant
3045     assert(int_cnt2 == -1, "should be != 0");
3046 
3047     addl(tmp, cnt2);
3048     // Found result if we matched whole substring.
3049     cmpl(tmp, stride);
3050     jcc(Assembler::lessEqual, RET_FOUND);
3051 
3052     // Repeat search for small substring (<= 8 chars)
3053     // from new point 'str1' without reloading substring.
3054     cmpl(cnt2, stride);
3055     // Have to check that we don't read beyond string.
3056     jccb(Assembler::lessEqual, ADJUST_STR);
3057 
3058     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3059     // Compare the rest of substring (> 8 chars).
3060     movptr(str1, result);
3061 
3062     cmpl(tmp, cnt2);
3063     // First 8 chars are already matched.
3064     jccb(Assembler::equal, CHECK_NEXT);
3065 
3066     bind(SCAN_SUBSTR);
3067     pcmpestri(vec, Address(str1, 0), mode);
3068     // Need to reload strings pointers if not matched whole vector
3069     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3070 
3071     bind(CHECK_NEXT);
3072     subl(cnt2, stride);
3073     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3074     addptr(str1, 16);
3075     if (ae == StrIntrinsicNode::UL) {
3076       addptr(str2, 8);
3077     } else {
3078       addptr(str2, 16);
3079     }
3080     subl(cnt1, stride);
3081     cmpl(cnt2, stride); // Do not read beyond substring
3082     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3083     // Back-up strings to avoid reading beyond substring.
3084 
3085     if (ae == StrIntrinsicNode::UL) {
3086       lea(str2, Address(str2, cnt2, scale2, -8));
3087       lea(str1, Address(str1, cnt2, scale1, -16));
3088     } else {
3089       lea(str2, Address(str2, cnt2, scale2, -16));
3090       lea(str1, Address(str1, cnt2, scale1, -16));
3091     }
3092     subl(cnt1, cnt2);
3093     movl(cnt2, stride);
3094     addl(cnt1, stride);
3095     bind(CONT_SCAN_SUBSTR);
3096     if (ae == StrIntrinsicNode::UL) {
3097       pmovzxbw(vec, Address(str2, 0));
3098     } else {
3099       movdqu(vec, Address(str2, 0));
3100     }
3101     jmp(SCAN_SUBSTR);
3102 
3103     bind(RET_FOUND_LONG);
3104     movptr(str1, Address(rsp, wordSize));
3105   } // non constant
3106 
3107   bind(RET_FOUND);
3108   // Compute substr offset
3109   subptr(result, str1);
3110   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3111     shrl(result, 1); // index
3112   }
3113   bind(CLEANUP);
3114   pop(rsp); // restore SP
3115 
3116 } // string_indexof
3117 
3118 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3119                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3120   ShortBranchVerifier sbv(this);
3121   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3122 
3123   int stride = 8;
3124 
3125   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3126         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3127         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3128         FOUND_SEQ_CHAR, DONE_LABEL;
3129 
3130   movptr(result, str1);
3131   if (UseAVX >= 2) {
3132     cmpl(cnt1, stride);
3133     jcc(Assembler::less, SCAN_TO_CHAR);
3134     cmpl(cnt1, 2*stride);
3135     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3136     movdl(vec1, ch);
3137     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3138     vpxor(vec2, vec2);
3139     movl(tmp, cnt1);
3140     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3141     andl(cnt1,0x0000000F);  //tail count (in chars)
3142 
3143     bind(SCAN_TO_16_CHAR_LOOP);
3144     vmovdqu(vec3, Address(result, 0));
3145     vpcmpeqw(vec3, vec3, vec1, 1);
3146     vptest(vec2, vec3);
3147     jcc(Assembler::carryClear, FOUND_CHAR);
3148     addptr(result, 32);
3149     subl(tmp, 2*stride);
3150     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3151     jmp(SCAN_TO_8_CHAR);
3152     bind(SCAN_TO_8_CHAR_INIT);
3153     movdl(vec1, ch);
3154     pshuflw(vec1, vec1, 0x00);
3155     pshufd(vec1, vec1, 0);
3156     pxor(vec2, vec2);
3157   }
3158   bind(SCAN_TO_8_CHAR);
3159   cmpl(cnt1, stride);
3160   jcc(Assembler::less, SCAN_TO_CHAR);
3161   if (UseAVX < 2) {
3162     movdl(vec1, ch);
3163     pshuflw(vec1, vec1, 0x00);
3164     pshufd(vec1, vec1, 0);
3165     pxor(vec2, vec2);
3166   }
3167   movl(tmp, cnt1);
3168   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3169   andl(cnt1,0x00000007);  //tail count (in chars)
3170 
3171   bind(SCAN_TO_8_CHAR_LOOP);
3172   movdqu(vec3, Address(result, 0));
3173   pcmpeqw(vec3, vec1);
3174   ptest(vec2, vec3);
3175   jcc(Assembler::carryClear, FOUND_CHAR);
3176   addptr(result, 16);
3177   subl(tmp, stride);
3178   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3179   bind(SCAN_TO_CHAR);
3180   testl(cnt1, cnt1);
3181   jcc(Assembler::zero, RET_NOT_FOUND);
3182   bind(SCAN_TO_CHAR_LOOP);
3183   load_unsigned_short(tmp, Address(result, 0));
3184   cmpl(ch, tmp);
3185   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3186   addptr(result, 2);
3187   subl(cnt1, 1);
3188   jccb(Assembler::zero, RET_NOT_FOUND);
3189   jmp(SCAN_TO_CHAR_LOOP);
3190 
3191   bind(RET_NOT_FOUND);
3192   movl(result, -1);
3193   jmpb(DONE_LABEL);
3194 
3195   bind(FOUND_CHAR);
3196   if (UseAVX >= 2) {
3197     vpmovmskb(tmp, vec3);
3198   } else {
3199     pmovmskb(tmp, vec3);
3200   }
3201   bsfl(ch, tmp);
3202   addptr(result, ch);
3203 
3204   bind(FOUND_SEQ_CHAR);
3205   subptr(result, str1);
3206   shrl(result, 1);
3207 
3208   bind(DONE_LABEL);
3209 } // string_indexof_char
3210 
3211 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3212                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3213   ShortBranchVerifier sbv(this);
3214   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3215 
3216   int stride = 16;
3217 
3218   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3219         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3220         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3221         FOUND_SEQ_CHAR, DONE_LABEL;
3222 
3223   movptr(result, str1);
3224   if (UseAVX >= 2) {
3225     cmpl(cnt1, stride);
3226     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3227     cmpl(cnt1, stride*2);
3228     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3229     movdl(vec1, ch);
3230     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3231     vpxor(vec2, vec2);
3232     movl(tmp, cnt1);
3233     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3234     andl(cnt1,0x0000001F);  //tail count (in chars)
3235 
3236     bind(SCAN_TO_32_CHAR_LOOP);
3237     vmovdqu(vec3, Address(result, 0));
3238     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3239     vptest(vec2, vec3);
3240     jcc(Assembler::carryClear, FOUND_CHAR);
3241     addptr(result, 32);
3242     subl(tmp, stride*2);
3243     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3244     jmp(SCAN_TO_16_CHAR);
3245 
3246     bind(SCAN_TO_16_CHAR_INIT);
3247     movdl(vec1, ch);
3248     pxor(vec2, vec2);
3249     pshufb(vec1, vec2);
3250   }
3251 
3252   bind(SCAN_TO_16_CHAR);
3253   cmpl(cnt1, stride);
3254   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entires left
3255   if (UseAVX < 2) {
3256     movdl(vec1, ch);
3257     pxor(vec2, vec2);
3258     pshufb(vec1, vec2);
3259   }
3260   movl(tmp, cnt1);
3261   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3262   andl(cnt1,0x0000000F);  //tail count (in bytes)
3263 
3264   bind(SCAN_TO_16_CHAR_LOOP);
3265   movdqu(vec3, Address(result, 0));
3266   pcmpeqb(vec3, vec1);
3267   ptest(vec2, vec3);
3268   jcc(Assembler::carryClear, FOUND_CHAR);
3269   addptr(result, 16);
3270   subl(tmp, stride);
3271   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3272 
3273   bind(SCAN_TO_CHAR_INIT);
3274   testl(cnt1, cnt1);
3275   jcc(Assembler::zero, RET_NOT_FOUND);
3276   bind(SCAN_TO_CHAR_LOOP);
3277   load_unsigned_byte(tmp, Address(result, 0));
3278   cmpl(ch, tmp);
3279   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3280   addptr(result, 1);
3281   subl(cnt1, 1);
3282   jccb(Assembler::zero, RET_NOT_FOUND);
3283   jmp(SCAN_TO_CHAR_LOOP);
3284 
3285   bind(RET_NOT_FOUND);
3286   movl(result, -1);
3287   jmpb(DONE_LABEL);
3288 
3289   bind(FOUND_CHAR);
3290   if (UseAVX >= 2) {
3291     vpmovmskb(tmp, vec3);
3292   } else {
3293     pmovmskb(tmp, vec3);
3294   }
3295   bsfl(ch, tmp);
3296   addptr(result, ch);
3297 
3298   bind(FOUND_SEQ_CHAR);
3299   subptr(result, str1);
3300 
3301   bind(DONE_LABEL);
3302 } // stringL_indexof_char
3303 
3304 // helper function for string_compare
3305 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3306                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3307                                            Address::ScaleFactor scale2, Register index, int ae) {
3308   if (ae == StrIntrinsicNode::LL) {
3309     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3310     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3311   } else if (ae == StrIntrinsicNode::UU) {
3312     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3313     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3314   } else {
3315     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3316     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3317   }
3318 }
3319 
3320 // Compare strings, used for char[] and byte[].
3321 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3322                                        Register cnt1, Register cnt2, Register result,
3323                                        XMMRegister vec1, int ae, KRegister mask) {
3324   ShortBranchVerifier sbv(this);
3325   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3326   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3327   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3328   int stride2x2 = 0x40;
3329   Address::ScaleFactor scale = Address::no_scale;
3330   Address::ScaleFactor scale1 = Address::no_scale;
3331   Address::ScaleFactor scale2 = Address::no_scale;
3332 
3333   if (ae != StrIntrinsicNode::LL) {
3334     stride2x2 = 0x20;
3335   }
3336 
3337   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3338     shrl(cnt2, 1);
3339   }
3340   // Compute the minimum of the string lengths and the
3341   // difference of the string lengths (stack).
3342   // Do the conditional move stuff
3343   movl(result, cnt1);
3344   subl(cnt1, cnt2);
3345   push(cnt1);
3346   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3347 
3348   // Is the minimum length zero?
3349   testl(cnt2, cnt2);
3350   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3351   if (ae == StrIntrinsicNode::LL) {
3352     // Load first bytes
3353     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3354     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3355   } else if (ae == StrIntrinsicNode::UU) {
3356     // Load first characters
3357     load_unsigned_short(result, Address(str1, 0));
3358     load_unsigned_short(cnt1, Address(str2, 0));
3359   } else {
3360     load_unsigned_byte(result, Address(str1, 0));
3361     load_unsigned_short(cnt1, Address(str2, 0));
3362   }
3363   subl(result, cnt1);
3364   jcc(Assembler::notZero,  POP_LABEL);
3365 
3366   if (ae == StrIntrinsicNode::UU) {
3367     // Divide length by 2 to get number of chars
3368     shrl(cnt2, 1);
3369   }
3370   cmpl(cnt2, 1);
3371   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3372 
3373   // Check if the strings start at the same location and setup scale and stride
3374   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3375     cmpptr(str1, str2);
3376     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3377     if (ae == StrIntrinsicNode::LL) {
3378       scale = Address::times_1;
3379       stride = 16;
3380     } else {
3381       scale = Address::times_2;
3382       stride = 8;
3383     }
3384   } else {
3385     scale1 = Address::times_1;
3386     scale2 = Address::times_2;
3387     // scale not used
3388     stride = 8;
3389   }
3390 
3391   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3392     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3393     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3394     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3395     Label COMPARE_TAIL_LONG;
3396     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3397 
3398     int pcmpmask = 0x19;
3399     if (ae == StrIntrinsicNode::LL) {
3400       pcmpmask &= ~0x01;
3401     }
3402 
3403     // Setup to compare 16-chars (32-bytes) vectors,
3404     // start from first character again because it has aligned address.
3405     if (ae == StrIntrinsicNode::LL) {
3406       stride2 = 32;
3407     } else {
3408       stride2 = 16;
3409     }
3410     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3411       adr_stride = stride << scale;
3412     } else {
3413       adr_stride1 = 8;  //stride << scale1;
3414       adr_stride2 = 16; //stride << scale2;
3415     }
3416 
3417     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3418     // rax and rdx are used by pcmpestri as elements counters
3419     movl(result, cnt2);
3420     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3421     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3422 
3423     // fast path : compare first 2 8-char vectors.
3424     bind(COMPARE_16_CHARS);
3425     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3426       movdqu(vec1, Address(str1, 0));
3427     } else {
3428       pmovzxbw(vec1, Address(str1, 0));
3429     }
3430     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3431     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3432 
3433     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3434       movdqu(vec1, Address(str1, adr_stride));
3435       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3436     } else {
3437       pmovzxbw(vec1, Address(str1, adr_stride1));
3438       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3439     }
3440     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3441     addl(cnt1, stride);
3442 
3443     // Compare the characters at index in cnt1
3444     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3445     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3446     subl(result, cnt2);
3447     jmp(POP_LABEL);
3448 
3449     // Setup the registers to start vector comparison loop
3450     bind(COMPARE_WIDE_VECTORS);
3451     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3452       lea(str1, Address(str1, result, scale));
3453       lea(str2, Address(str2, result, scale));
3454     } else {
3455       lea(str1, Address(str1, result, scale1));
3456       lea(str2, Address(str2, result, scale2));
3457     }
3458     subl(result, stride2);
3459     subl(cnt2, stride2);
3460     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3461     negptr(result);
3462 
3463     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3464     bind(COMPARE_WIDE_VECTORS_LOOP);
3465 
3466 #ifdef _LP64
3467     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3468       cmpl(cnt2, stride2x2);
3469       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3470       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3471       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3472 
3473       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3474       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3475         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3476         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3477       } else {
3478         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3479         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3480       }
3481       kortestql(mask, mask);
3482       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3483       addptr(result, stride2x2);  // update since we already compared at this addr
3484       subl(cnt2, stride2x2);      // and sub the size too
3485       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3486 
3487       vpxor(vec1, vec1);
3488       jmpb(COMPARE_WIDE_TAIL);
3489     }//if (VM_Version::supports_avx512vlbw())
3490 #endif // _LP64
3491 
3492 
3493     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3494     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3495       vmovdqu(vec1, Address(str1, result, scale));
3496       vpxor(vec1, Address(str2, result, scale));
3497     } else {
3498       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3499       vpxor(vec1, Address(str2, result, scale2));
3500     }
3501     vptest(vec1, vec1);
3502     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3503     addptr(result, stride2);
3504     subl(cnt2, stride2);
3505     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3506     // clean upper bits of YMM registers
3507     vpxor(vec1, vec1);
3508 
3509     // compare wide vectors tail
3510     bind(COMPARE_WIDE_TAIL);
3511     testptr(result, result);
3512     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3513 
3514     movl(result, stride2);
3515     movl(cnt2, result);
3516     negptr(result);
3517     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3518 
3519     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3520     bind(VECTOR_NOT_EQUAL);
3521     // clean upper bits of YMM registers
3522     vpxor(vec1, vec1);
3523     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3524       lea(str1, Address(str1, result, scale));
3525       lea(str2, Address(str2, result, scale));
3526     } else {
3527       lea(str1, Address(str1, result, scale1));
3528       lea(str2, Address(str2, result, scale2));
3529     }
3530     jmp(COMPARE_16_CHARS);
3531 
3532     // Compare tail chars, length between 1 to 15 chars
3533     bind(COMPARE_TAIL_LONG);
3534     movl(cnt2, result);
3535     cmpl(cnt2, stride);
3536     jcc(Assembler::less, COMPARE_SMALL_STR);
3537 
3538     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3539       movdqu(vec1, Address(str1, 0));
3540     } else {
3541       pmovzxbw(vec1, Address(str1, 0));
3542     }
3543     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3544     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3545     subptr(cnt2, stride);
3546     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3547     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3548       lea(str1, Address(str1, result, scale));
3549       lea(str2, Address(str2, result, scale));
3550     } else {
3551       lea(str1, Address(str1, result, scale1));
3552       lea(str2, Address(str2, result, scale2));
3553     }
3554     negptr(cnt2);
3555     jmpb(WHILE_HEAD_LABEL);
3556 
3557     bind(COMPARE_SMALL_STR);
3558   } else if (UseSSE42Intrinsics) {
3559     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3560     int pcmpmask = 0x19;
3561     // Setup to compare 8-char (16-byte) vectors,
3562     // start from first character again because it has aligned address.
3563     movl(result, cnt2);
3564     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3565     if (ae == StrIntrinsicNode::LL) {
3566       pcmpmask &= ~0x01;
3567     }
3568     jcc(Assembler::zero, COMPARE_TAIL);
3569     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3570       lea(str1, Address(str1, result, scale));
3571       lea(str2, Address(str2, result, scale));
3572     } else {
3573       lea(str1, Address(str1, result, scale1));
3574       lea(str2, Address(str2, result, scale2));
3575     }
3576     negptr(result);
3577 
3578     // pcmpestri
3579     //   inputs:
3580     //     vec1- substring
3581     //     rax - negative string length (elements count)
3582     //     mem - scanned string
3583     //     rdx - string length (elements count)
3584     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3585     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3586     //   outputs:
3587     //     rcx - first mismatched element index
3588     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3589 
3590     bind(COMPARE_WIDE_VECTORS);
3591     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3592       movdqu(vec1, Address(str1, result, scale));
3593       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3594     } else {
3595       pmovzxbw(vec1, Address(str1, result, scale1));
3596       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3597     }
3598     // After pcmpestri cnt1(rcx) contains mismatched element index
3599 
3600     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3601     addptr(result, stride);
3602     subptr(cnt2, stride);
3603     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3604 
3605     // compare wide vectors tail
3606     testptr(result, result);
3607     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3608 
3609     movl(cnt2, stride);
3610     movl(result, stride);
3611     negptr(result);
3612     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3613       movdqu(vec1, Address(str1, result, scale));
3614       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3615     } else {
3616       pmovzxbw(vec1, Address(str1, result, scale1));
3617       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3618     }
3619     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3620 
3621     // Mismatched characters in the vectors
3622     bind(VECTOR_NOT_EQUAL);
3623     addptr(cnt1, result);
3624     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3625     subl(result, cnt2);
3626     jmpb(POP_LABEL);
3627 
3628     bind(COMPARE_TAIL); // limit is zero
3629     movl(cnt2, result);
3630     // Fallthru to tail compare
3631   }
3632   // Shift str2 and str1 to the end of the arrays, negate min
3633   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3634     lea(str1, Address(str1, cnt2, scale));
3635     lea(str2, Address(str2, cnt2, scale));
3636   } else {
3637     lea(str1, Address(str1, cnt2, scale1));
3638     lea(str2, Address(str2, cnt2, scale2));
3639   }
3640   decrementl(cnt2);  // first character was compared already
3641   negptr(cnt2);
3642 
3643   // Compare the rest of the elements
3644   bind(WHILE_HEAD_LABEL);
3645   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3646   subl(result, cnt1);
3647   jccb(Assembler::notZero, POP_LABEL);
3648   increment(cnt2);
3649   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3650 
3651   // Strings are equal up to min length.  Return the length difference.
3652   bind(LENGTH_DIFF_LABEL);
3653   pop(result);
3654   if (ae == StrIntrinsicNode::UU) {
3655     // Divide diff by 2 to get number of chars
3656     sarl(result, 1);
3657   }
3658   jmpb(DONE_LABEL);
3659 
3660 #ifdef _LP64
3661   if (VM_Version::supports_avx512vlbw()) {
3662 
3663     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3664 
3665     kmovql(cnt1, mask);
3666     notq(cnt1);
3667     bsfq(cnt2, cnt1);
3668     if (ae != StrIntrinsicNode::LL) {
3669       // Divide diff by 2 to get number of chars
3670       sarl(cnt2, 1);
3671     }
3672     addq(result, cnt2);
3673     if (ae == StrIntrinsicNode::LL) {
3674       load_unsigned_byte(cnt1, Address(str2, result));
3675       load_unsigned_byte(result, Address(str1, result));
3676     } else if (ae == StrIntrinsicNode::UU) {
3677       load_unsigned_short(cnt1, Address(str2, result, scale));
3678       load_unsigned_short(result, Address(str1, result, scale));
3679     } else {
3680       load_unsigned_short(cnt1, Address(str2, result, scale2));
3681       load_unsigned_byte(result, Address(str1, result, scale1));
3682     }
3683     subl(result, cnt1);
3684     jmpb(POP_LABEL);
3685   }//if (VM_Version::supports_avx512vlbw())
3686 #endif // _LP64
3687 
3688   // Discard the stored length difference
3689   bind(POP_LABEL);
3690   pop(cnt1);
3691 
3692   // That's it
3693   bind(DONE_LABEL);
3694   if(ae == StrIntrinsicNode::UL) {
3695     negl(result);
3696   }
3697 
3698 }
3699 
3700 // Search for Non-ASCII character (Negative byte value) in a byte array,
3701 // return true if it has any and false otherwise.
3702 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3703 //   @IntrinsicCandidate
3704 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3705 //     for (int i = off; i < off + len; i++) {
3706 //       if (ba[i] < 0) {
3707 //         return true;
3708 //       }
3709 //     }
3710 //     return false;
3711 //   }
3712 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3713   Register result, Register tmp1,
3714   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3715   // rsi: byte array
3716   // rcx: len
3717   // rax: result
3718   ShortBranchVerifier sbv(this);
3719   assert_different_registers(ary1, len, result, tmp1);
3720   assert_different_registers(vec1, vec2);
3721   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3722 
3723   // len == 0
3724   testl(len, len);
3725   jcc(Assembler::zero, FALSE_LABEL);
3726 
3727   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3728     VM_Version::supports_avx512vlbw() &&
3729     VM_Version::supports_bmi2()) {
3730 
3731     Label test_64_loop, test_tail;
3732     Register tmp3_aliased = len;
3733 
3734     movl(tmp1, len);
3735     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3736 
3737     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3738     andl(len, ~(64 - 1));    // vector count (in chars)
3739     jccb(Assembler::zero, test_tail);
3740 
3741     lea(ary1, Address(ary1, len, Address::times_1));
3742     negptr(len);
3743 
3744     bind(test_64_loop);
3745     // Check whether our 64 elements of size byte contain negatives
3746     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3747     kortestql(mask1, mask1);
3748     jcc(Assembler::notZero, TRUE_LABEL);
3749 
3750     addptr(len, 64);
3751     jccb(Assembler::notZero, test_64_loop);
3752 
3753 
3754     bind(test_tail);
3755     // bail out when there is nothing to be done
3756     testl(tmp1, -1);
3757     jcc(Assembler::zero, FALSE_LABEL);
3758 
3759     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3760 #ifdef _LP64
3761     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3762     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3763     notq(tmp3_aliased);
3764     kmovql(mask2, tmp3_aliased);
3765 #else
3766     Label k_init;
3767     jmp(k_init);
3768 
3769     // We could not read 64-bits from a general purpose register thus we move
3770     // data required to compose 64 1's to the instruction stream
3771     // We emit 64 byte wide series of elements from 0..63 which later on would
3772     // be used as a compare targets with tail count contained in tmp1 register.
3773     // Result would be a k register having tmp1 consecutive number or 1
3774     // counting from least significant bit.
3775     address tmp = pc();
3776     emit_int64(0x0706050403020100);
3777     emit_int64(0x0F0E0D0C0B0A0908);
3778     emit_int64(0x1716151413121110);
3779     emit_int64(0x1F1E1D1C1B1A1918);
3780     emit_int64(0x2726252423222120);
3781     emit_int64(0x2F2E2D2C2B2A2928);
3782     emit_int64(0x3736353433323130);
3783     emit_int64(0x3F3E3D3C3B3A3938);
3784 
3785     bind(k_init);
3786     lea(len, InternalAddress(tmp));
3787     // create mask to test for negative byte inside a vector
3788     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3789     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3790 
3791 #endif
3792     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3793     ktestq(mask1, mask2);
3794     jcc(Assembler::notZero, TRUE_LABEL);
3795 
3796     jmp(FALSE_LABEL);
3797   } else {
3798     movl(result, len); // copy
3799 
3800     if (UseAVX >= 2 && UseSSE >= 2) {
3801       // With AVX2, use 32-byte vector compare
3802       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3803 
3804       // Compare 32-byte vectors
3805       andl(result, 0x0000001f);  //   tail count (in bytes)
3806       andl(len, 0xffffffe0);   // vector count (in bytes)
3807       jccb(Assembler::zero, COMPARE_TAIL);
3808 
3809       lea(ary1, Address(ary1, len, Address::times_1));
3810       negptr(len);
3811 
3812       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3813       movdl(vec2, tmp1);
3814       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3815 
3816       bind(COMPARE_WIDE_VECTORS);
3817       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3818       vptest(vec1, vec2);
3819       jccb(Assembler::notZero, TRUE_LABEL);
3820       addptr(len, 32);
3821       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3822 
3823       testl(result, result);
3824       jccb(Assembler::zero, FALSE_LABEL);
3825 
3826       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3827       vptest(vec1, vec2);
3828       jccb(Assembler::notZero, TRUE_LABEL);
3829       jmpb(FALSE_LABEL);
3830 
3831       bind(COMPARE_TAIL); // len is zero
3832       movl(len, result);
3833       // Fallthru to tail compare
3834     } else if (UseSSE42Intrinsics) {
3835       // With SSE4.2, use double quad vector compare
3836       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3837 
3838       // Compare 16-byte vectors
3839       andl(result, 0x0000000f);  //   tail count (in bytes)
3840       andl(len, 0xfffffff0);   // vector count (in bytes)
3841       jcc(Assembler::zero, COMPARE_TAIL);
3842 
3843       lea(ary1, Address(ary1, len, Address::times_1));
3844       negptr(len);
3845 
3846       movl(tmp1, 0x80808080);
3847       movdl(vec2, tmp1);
3848       pshufd(vec2, vec2, 0);
3849 
3850       bind(COMPARE_WIDE_VECTORS);
3851       movdqu(vec1, Address(ary1, len, Address::times_1));
3852       ptest(vec1, vec2);
3853       jcc(Assembler::notZero, TRUE_LABEL);
3854       addptr(len, 16);
3855       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3856 
3857       testl(result, result);
3858       jcc(Assembler::zero, FALSE_LABEL);
3859 
3860       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3861       ptest(vec1, vec2);
3862       jccb(Assembler::notZero, TRUE_LABEL);
3863       jmpb(FALSE_LABEL);
3864 
3865       bind(COMPARE_TAIL); // len is zero
3866       movl(len, result);
3867       // Fallthru to tail compare
3868     }
3869   }
3870   // Compare 4-byte vectors
3871   andl(len, 0xfffffffc); // vector count (in bytes)
3872   jccb(Assembler::zero, COMPARE_CHAR);
3873 
3874   lea(ary1, Address(ary1, len, Address::times_1));
3875   negptr(len);
3876 
3877   bind(COMPARE_VECTORS);
3878   movl(tmp1, Address(ary1, len, Address::times_1));
3879   andl(tmp1, 0x80808080);
3880   jccb(Assembler::notZero, TRUE_LABEL);
3881   addptr(len, 4);
3882   jcc(Assembler::notZero, COMPARE_VECTORS);
3883 
3884   // Compare trailing char (final 2 bytes), if any
3885   bind(COMPARE_CHAR);
3886   testl(result, 0x2);   // tail  char
3887   jccb(Assembler::zero, COMPARE_BYTE);
3888   load_unsigned_short(tmp1, Address(ary1, 0));
3889   andl(tmp1, 0x00008080);
3890   jccb(Assembler::notZero, TRUE_LABEL);
3891   subptr(result, 2);
3892   lea(ary1, Address(ary1, 2));
3893 
3894   bind(COMPARE_BYTE);
3895   testl(result, 0x1);   // tail  byte
3896   jccb(Assembler::zero, FALSE_LABEL);
3897   load_unsigned_byte(tmp1, Address(ary1, 0));
3898   andl(tmp1, 0x00000080);
3899   jccb(Assembler::notEqual, TRUE_LABEL);
3900   jmpb(FALSE_LABEL);
3901 
3902   bind(TRUE_LABEL);
3903   movl(result, 1);   // return true
3904   jmpb(DONE);
3905 
3906   bind(FALSE_LABEL);
3907   xorl(result, result); // return false
3908 
3909   // That's it
3910   bind(DONE);
3911   if (UseAVX >= 2 && UseSSE >= 2) {
3912     // clean upper bits of YMM registers
3913     vpxor(vec1, vec1);
3914     vpxor(vec2, vec2);
3915   }
3916 }
3917 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
3918 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3919                                       Register limit, Register result, Register chr,
3920                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3921   ShortBranchVerifier sbv(this);
3922   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3923 
3924   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3925   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3926 
3927   if (is_array_equ) {
3928     // Check the input args
3929     cmpoop(ary1, ary2);
3930     jcc(Assembler::equal, TRUE_LABEL);
3931 
3932     // Need additional checks for arrays_equals.
3933     testptr(ary1, ary1);
3934     jcc(Assembler::zero, FALSE_LABEL);
3935     testptr(ary2, ary2);
3936     jcc(Assembler::zero, FALSE_LABEL);
3937 
3938     // Check the lengths
3939     movl(limit, Address(ary1, length_offset));
3940     cmpl(limit, Address(ary2, length_offset));
3941     jcc(Assembler::notEqual, FALSE_LABEL);
3942   }
3943 
3944   // count == 0
3945   testl(limit, limit);
3946   jcc(Assembler::zero, TRUE_LABEL);
3947 
3948   if (is_array_equ) {
3949     // Load array address
3950     lea(ary1, Address(ary1, base_offset));
3951     lea(ary2, Address(ary2, base_offset));
3952   }
3953 
3954   if (is_array_equ && is_char) {
3955     // arrays_equals when used for char[].
3956     shll(limit, 1);      // byte count != 0
3957   }
3958   movl(result, limit); // copy
3959 
3960   if (UseAVX >= 2) {
3961     // With AVX2, use 32-byte vector compare
3962     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3963 
3964     // Compare 32-byte vectors
3965     andl(result, 0x0000001f);  //   tail count (in bytes)
3966     andl(limit, 0xffffffe0);   // vector count (in bytes)
3967     jcc(Assembler::zero, COMPARE_TAIL);
3968 
3969     lea(ary1, Address(ary1, limit, Address::times_1));
3970     lea(ary2, Address(ary2, limit, Address::times_1));
3971     negptr(limit);
3972 
3973 #ifdef _LP64
3974     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3975       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3976 
3977       cmpl(limit, -64);
3978       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3979 
3980       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3981 
3982       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3983       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3984       kortestql(mask, mask);
3985       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3986       addptr(limit, 64);  // update since we already compared at this addr
3987       cmpl(limit, -64);
3988       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3989 
3990       // At this point we may still need to compare -limit+result bytes.
3991       // We could execute the next two instruction and just continue via non-wide path:
3992       //  cmpl(limit, 0);
3993       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3994       // But since we stopped at the points ary{1,2}+limit which are
3995       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3996       // (|limit| <= 32 and result < 32),
3997       // we may just compare the last 64 bytes.
3998       //
3999       addptr(result, -64);   // it is safe, bc we just came from this area
4000       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4001       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4002       kortestql(mask, mask);
4003       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4004 
4005       jmp(TRUE_LABEL);
4006 
4007       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4008 
4009     }//if (VM_Version::supports_avx512vlbw())
4010 #endif //_LP64
4011     bind(COMPARE_WIDE_VECTORS);
4012     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4013     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4014     vpxor(vec1, vec2);
4015 
4016     vptest(vec1, vec1);
4017     jcc(Assembler::notZero, FALSE_LABEL);
4018     addptr(limit, 32);
4019     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4020 
4021     testl(result, result);
4022     jcc(Assembler::zero, TRUE_LABEL);
4023 
4024     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4025     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4026     vpxor(vec1, vec2);
4027 
4028     vptest(vec1, vec1);
4029     jccb(Assembler::notZero, FALSE_LABEL);
4030     jmpb(TRUE_LABEL);
4031 
4032     bind(COMPARE_TAIL); // limit is zero
4033     movl(limit, result);
4034     // Fallthru to tail compare
4035   } else if (UseSSE42Intrinsics) {
4036     // With SSE4.2, use double quad vector compare
4037     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4038 
4039     // Compare 16-byte vectors
4040     andl(result, 0x0000000f);  //   tail count (in bytes)
4041     andl(limit, 0xfffffff0);   // vector count (in bytes)
4042     jcc(Assembler::zero, COMPARE_TAIL);
4043 
4044     lea(ary1, Address(ary1, limit, Address::times_1));
4045     lea(ary2, Address(ary2, limit, Address::times_1));
4046     negptr(limit);
4047 
4048     bind(COMPARE_WIDE_VECTORS);
4049     movdqu(vec1, Address(ary1, limit, Address::times_1));
4050     movdqu(vec2, Address(ary2, limit, Address::times_1));
4051     pxor(vec1, vec2);
4052 
4053     ptest(vec1, vec1);
4054     jcc(Assembler::notZero, FALSE_LABEL);
4055     addptr(limit, 16);
4056     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4057 
4058     testl(result, result);
4059     jcc(Assembler::zero, TRUE_LABEL);
4060 
4061     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4062     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4063     pxor(vec1, vec2);
4064 
4065     ptest(vec1, vec1);
4066     jccb(Assembler::notZero, FALSE_LABEL);
4067     jmpb(TRUE_LABEL);
4068 
4069     bind(COMPARE_TAIL); // limit is zero
4070     movl(limit, result);
4071     // Fallthru to tail compare
4072   }
4073 
4074   // Compare 4-byte vectors
4075   andl(limit, 0xfffffffc); // vector count (in bytes)
4076   jccb(Assembler::zero, COMPARE_CHAR);
4077 
4078   lea(ary1, Address(ary1, limit, Address::times_1));
4079   lea(ary2, Address(ary2, limit, Address::times_1));
4080   negptr(limit);
4081 
4082   bind(COMPARE_VECTORS);
4083   movl(chr, Address(ary1, limit, Address::times_1));
4084   cmpl(chr, Address(ary2, limit, Address::times_1));
4085   jccb(Assembler::notEqual, FALSE_LABEL);
4086   addptr(limit, 4);
4087   jcc(Assembler::notZero, COMPARE_VECTORS);
4088 
4089   // Compare trailing char (final 2 bytes), if any
4090   bind(COMPARE_CHAR);
4091   testl(result, 0x2);   // tail  char
4092   jccb(Assembler::zero, COMPARE_BYTE);
4093   load_unsigned_short(chr, Address(ary1, 0));
4094   load_unsigned_short(limit, Address(ary2, 0));
4095   cmpl(chr, limit);
4096   jccb(Assembler::notEqual, FALSE_LABEL);
4097 
4098   if (is_array_equ && is_char) {
4099     bind(COMPARE_BYTE);
4100   } else {
4101     lea(ary1, Address(ary1, 2));
4102     lea(ary2, Address(ary2, 2));
4103 
4104     bind(COMPARE_BYTE);
4105     testl(result, 0x1);   // tail  byte
4106     jccb(Assembler::zero, TRUE_LABEL);
4107     load_unsigned_byte(chr, Address(ary1, 0));
4108     load_unsigned_byte(limit, Address(ary2, 0));
4109     cmpl(chr, limit);
4110     jccb(Assembler::notEqual, FALSE_LABEL);
4111   }
4112   bind(TRUE_LABEL);
4113   movl(result, 1);   // return true
4114   jmpb(DONE);
4115 
4116   bind(FALSE_LABEL);
4117   xorl(result, result); // return false
4118 
4119   // That's it
4120   bind(DONE);
4121   if (UseAVX >= 2) {
4122     // clean upper bits of YMM registers
4123     vpxor(vec1, vec1);
4124     vpxor(vec2, vec2);
4125   }
4126 }
4127 
4128 #ifdef _LP64
4129 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4130                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
4131   assert(VM_Version::supports_avx512vlbw(), "");
4132   vpxor(xtmp, xtmp, xtmp, vec_enc);
4133   vpsubb(xtmp, xtmp, mask, vec_enc);
4134   evpmovb2m(ktmp, xtmp, vec_enc);
4135   kmovql(tmp, ktmp);
4136   switch(opc) {
4137     case Op_VectorMaskTrueCount:
4138       popcntq(dst, tmp);
4139       break;
4140     case Op_VectorMaskLastTrue:
4141       mov64(dst, -1);
4142       bsrq(tmp, tmp);
4143       cmov(Assembler::notZero, dst, tmp);
4144       break;
4145     case Op_VectorMaskFirstTrue:
4146       mov64(dst, masklen);
4147       bsfq(tmp, tmp);
4148       cmov(Assembler::notZero, dst, tmp);
4149       break;
4150     default: assert(false, "Unhandled mask operation");
4151   }
4152 }
4153 
4154 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4155                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
4156   assert(VM_Version::supports_avx(), "");
4157   vpxor(xtmp, xtmp, xtmp, vec_enc);
4158   vpsubb(xtmp, xtmp, mask, vec_enc);
4159   vpmovmskb(tmp, xtmp, vec_enc);
4160   if (masklen < 64) {
4161     andq(tmp, (((jlong)1 << masklen) - 1));
4162   }
4163   switch(opc) {
4164     case Op_VectorMaskTrueCount:
4165       popcntq(dst, tmp);
4166       break;
4167     case Op_VectorMaskLastTrue:
4168       mov64(dst, -1);
4169       bsrq(tmp, tmp);
4170       cmov(Assembler::notZero, dst, tmp);
4171       break;
4172     case Op_VectorMaskFirstTrue:
4173       mov64(dst, masklen);
4174       bsfq(tmp, tmp);
4175       cmov(Assembler::notZero, dst, tmp);
4176       break;
4177     default: assert(false, "Unhandled mask operation");
4178   }
4179 }
4180 #endif
4181 
4182 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
4183                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
4184                                         int vlen_enc) {
4185   assert(VM_Version::supports_avx512bw(), "");
4186   // Byte shuffles are inlane operations and indices are determined using
4187   // lower 4 bit of each shuffle lane, thus all shuffle indices are
4188   // normalized to index range 0-15. This makes sure that all the multiples
4189   // of an index value are placed at same relative position in 128 bit
4190   // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
4191   // will be 16th element in their respective 128 bit lanes.
4192   movl(rtmp, 16);
4193   evpbroadcastb(xtmp1, rtmp, vlen_enc);
4194 
4195   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
4196   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
4197   // original shuffle indices and move the shuffled lanes corresponding to true
4198   // mask to destination vector.
4199   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
4200   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
4201   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
4202 
4203   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
4204   // and broadcasting second 128 bit lane.
4205   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
4206   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
4207   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
4208   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
4209   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4210 
4211   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
4212   // and broadcasting third 128 bit lane.
4213   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
4214   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
4215   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
4216   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
4217   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4218 
4219   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
4220   // and broadcasting third 128 bit lane.
4221   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
4222   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
4223   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
4224   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
4225   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4226 }
4227 
4228 #ifdef _LP64
4229 void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
4230   C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
4231   Compile::current()->output()->add_stub(stub);
4232 
4233   // Note: Don't clobber obj anywhere in that method!
4234 
4235   // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
4236   // obj-start, so that we can load from the object's mark-word instead. Usually the address
4237   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
4238   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
4239   // then passes that register as obj and 0 in disp. The following code extracts the base
4240   // and offset to load the mark-word.
4241   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
4242   movq(dst, Address(obj, index, scale, offset));
4243   testb(dst, markWord::monitor_value);
4244   jcc(Assembler::notZero, stub->entry());
4245   bind(stub->continuation());
4246   shrq(dst, markWord::klass_shift);
4247 }
4248 #endif