1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  37   switch (vlen_in_bytes) {
  38     case  4: // fall-through
  39     case  8: // fall-through
  40     case 16: return Assembler::AVX_128bit;
  41     case 32: return Assembler::AVX_256bit;
  42     case 64: return Assembler::AVX_512bit;
  43 
  44     default: {
  45       ShouldNotReachHere();
  46       return Assembler::AVX_NoVec;
  47     }
  48   }
  49 }
  50 
  51 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  52   guarantee(PostLoopMultiversioning, "must be");
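  // Build an opmask with the low 'src' bits set ((1 << src) - 1), move it into 'mask',
  // and leave the requested length from 'src' in 'dst'.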
  53   Assembler::movl(dst, 1);
  54   Assembler::shlxl(dst, dst, src);
  55   Assembler::decl(dst);
  56   Assembler::kmovdl(mask, dst);
  57   Assembler::movl(dst, src);
  58 }
  59 
  60 void C2_MacroAssembler::restorevectmask(KRegister mask) {
  61   guarantee(PostLoopMultiversioning, "must be");
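  // Restore the opmask used by post-loop multiversioning: mask = ~k0.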
  62   Assembler::knotwl(mask, k0);
  63 }
  64 
  65 #if INCLUDE_RTM_OPT
  66 
  67 // Update rtm_counters based on abort status
  68 // input: abort_status
  69 //        rtm_counters (RTMLockingCounters*)
  70 // flags are killed
  71 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  72 
  73   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  74   if (PrintPreciseRTMLockingStatistics) {
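    // Bump the matching abortX counter for every abort-status bit that is set.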
  75     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  76       Label check_abort;
  77       testl(abort_status, (1<<i));
  78       jccb(Assembler::equal, check_abort);
  79       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  80       bind(check_abort);
  81     }
  82   }
  83 }
  84 
  85 // Branch if (random & (count-1) != 0), count is 2^n
  86 // tmp, scr and flags are killed
  87 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  88   assert(tmp == rax, "");
  89   assert(scr == rdx, "");
  90   rdtsc(); // modifies EDX:EAX
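  // The low bits of the time-stamp counter serve as the pseudo-random value.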
  91   andptr(tmp, count-1);
  92   jccb(Assembler::notZero, brLabel);
  93 }
  94 
  95 // Perform abort ratio calculation, set no_rtm bit if high ratio
  96 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  97 // tmpReg, rtm_counters_Reg and flags are killed
  98 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  99                                                     Register rtm_counters_Reg,
 100                                                     RTMLockingCounters* rtm_counters,
 101                                                     Metadata* method_data) {
 102   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 103 
 104   if (RTMLockingCalculationDelay > 0) {
 105     // Delay calculation
 106     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 107     testptr(tmpReg, tmpReg);
 108     jccb(Assembler::equal, L_done);
 109   }
 110   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 111   //   Aborted transactions = abort_count * 100
 112   //   All transactions = total_count *  RTMTotalCountIncrRate
 113   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 114 
 115   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 116   cmpptr(tmpReg, RTMAbortThreshold);
 117   jccb(Assembler::below, L_check_always_rtm2);
 118   imulptr(tmpReg, tmpReg, 100);
 119 
 120   Register scrReg = rtm_counters_Reg;
 121   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 122   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 123   imulptr(scrReg, scrReg, RTMAbortRatio);
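  // Compare aborted * 100 against total * RTMTotalCountIncrRate * RTMAbortRatio.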
 124   cmpptr(tmpReg, scrReg);
 125   jccb(Assembler::below, L_check_always_rtm1);
 126   if (method_data != NULL) {
 127     // set rtm_state to "no rtm" in MDO
 128     mov_metadata(tmpReg, method_data);
 129     lock();
 130     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 131   }
 132   jmpb(L_done);
 133   bind(L_check_always_rtm1);
 134   // Reload RTMLockingCounters* address
 135   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 136   bind(L_check_always_rtm2);
 137   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 138   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 139   jccb(Assembler::below, L_done);
 140   if (method_data != NULL) {
 141     // set rtm_state to "always rtm" in MDO
 142     mov_metadata(tmpReg, method_data);
 143     lock();
 144     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 145   }
 146   bind(L_done);
 147 }
 148 
 149 // Update counters and perform abort ratio calculation
 150 // input:  abort_status_Reg
 151 // rtm_counters_Reg, flags are killed
 152 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 153                                       Register rtm_counters_Reg,
 154                                       RTMLockingCounters* rtm_counters,
 155                                       Metadata* method_data,
 156                                       bool profile_rtm) {
 157 
 158   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 159   // update rtm counters based on rax value at abort
 160   // reads abort_status_Reg, updates flags
 161   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 162   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 163   if (profile_rtm) {
 164     // Save abort status because abort_status_Reg is used by following code.
 165     if (RTMRetryCount > 0) {
 166       push(abort_status_Reg);
 167     }
 168     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 169     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 170     // restore abort status
 171     if (RTMRetryCount > 0) {
 172       pop(abort_status_Reg);
 173     }
 174   }
 175 }
 176 
 177 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 178 // inputs: retry_count_Reg
 179 //       : abort_status_Reg
 180 // output: retry_count_Reg decremented by 1
 181 // flags are killed
 182 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 183   Label doneRetry;
 184   assert(abort_status_Reg == rax, "");
 185   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 186   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 187   // if reason is in 0x6 and retry count != 0 then retry
 188   andptr(abort_status_Reg, 0x6);
 189   jccb(Assembler::zero, doneRetry);
 190   testl(retry_count_Reg, retry_count_Reg);
 191   jccb(Assembler::zero, doneRetry);
 192   pause();
 193   decrementl(retry_count_Reg);
 194   jmp(retryLabel);
 195   bind(doneRetry);
 196 }
 197 
 198 // Spin and retry if lock is busy,
 199 // inputs: box_Reg (monitor address)
 200 //       : retry_count_Reg
 201 // output: retry_count_Reg decremented by 1
 202 //       : clear z flag if retry count exceeded
 203 // tmp_Reg, scr_Reg, flags are killed
 204 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 205                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 206   Label SpinLoop, SpinExit, doneRetry;
 207   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 208 
 209   testl(retry_count_Reg, retry_count_Reg);
 210   jccb(Assembler::zero, doneRetry);
 211   decrementl(retry_count_Reg);
 212   movptr(scr_Reg, RTMSpinLoopCount);
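  // Spin for up to RTMSpinLoopCount iterations waiting for the owner field to clear.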
 213 
 214   bind(SpinLoop);
 215   pause();
 216   decrementl(scr_Reg);
 217   jccb(Assembler::lessEqual, SpinExit);
 218   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 219   testptr(tmp_Reg, tmp_Reg);
 220   jccb(Assembler::notZero, SpinLoop);
 221 
 222   bind(SpinExit);
 223   jmp(retryLabel);
 224   bind(doneRetry);
 225   incrementl(retry_count_Reg); // clear z flag
 226 }
 227 
 228 // Use RTM for normal stack locks
 229 // Input: objReg (object to lock)
 230 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 231                                          Register retry_on_abort_count_Reg,
 232                                          RTMLockingCounters* stack_rtm_counters,
 233                                          Metadata* method_data, bool profile_rtm,
 234                                          Label& DONE_LABEL, Label& IsInflated) {
 235   assert(UseRTMForStackLocks, "why call this otherwise?");
 236   assert(tmpReg == rax, "");
 237   assert(scrReg == rdx, "");
 238   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 239 
 240   if (RTMRetryCount > 0) {
 241     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 242     bind(L_rtm_retry);
 243   }
 244   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 245   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 246   jcc(Assembler::notZero, IsInflated);
 247 
 248   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 249     Label L_noincrement;
 250     if (RTMTotalCountIncrRate > 1) {
 251       // tmpReg, scrReg and flags are killed
 252       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 253     }
 254     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 255     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 256     bind(L_noincrement);
 257   }
 258   xbegin(L_on_abort);
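  // Start the transactional region; an abort transfers control to L_on_abort with the
  // abort status in rax.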
 259   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 260   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 261   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 262   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 263 
 264   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 265   if (UseRTMXendForLockBusy) {
 266     xend();
 267     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 268     jmp(L_decrement_retry);
 269   }
 270   else {
 271     xabort(0);
 272   }
 273   bind(L_on_abort);
 274   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 275     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 276   }
 277   bind(L_decrement_retry);
 278   if (RTMRetryCount > 0) {
 279     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 280     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 281   }
 282 }
 283 
 284 // Use RTM for inflating locks
 285 // inputs: objReg (object to lock)
 286 //         boxReg (on-stack box address (displaced header location) - KILLED)
 287 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 288 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 289                                             Register scrReg, Register retry_on_busy_count_Reg,
 290                                             Register retry_on_abort_count_Reg,
 291                                             RTMLockingCounters* rtm_counters,
 292                                             Metadata* method_data, bool profile_rtm,
 293                                             Label& DONE_LABEL) {
 294   assert(UseRTMLocking, "why call this otherwise?");
 295   assert(tmpReg == rax, "");
 296   assert(scrReg == rdx, "");
 297   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 298   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 299 
 300   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 301   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 302   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 303 
 304   if (RTMRetryCount > 0) {
 305     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 306     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 307     bind(L_rtm_retry);
 308   }
 309   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 310     Label L_noincrement;
 311     if (RTMTotalCountIncrRate > 1) {
 312       // tmpReg, scrReg and flags are killed
 313       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 314     }
 315     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 316     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 317     bind(L_noincrement);
 318   }
 319   xbegin(L_on_abort);
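  // Start the transactional region; an abort transfers control to L_on_abort with the
  // abort status in rax.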
 320   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 321   movptr(tmpReg, Address(tmpReg, owner_offset));
 322   testptr(tmpReg, tmpReg);
 323   jcc(Assembler::zero, DONE_LABEL);
 324   if (UseRTMXendForLockBusy) {
 325     xend();
 326     jmp(L_decrement_retry);
 327   }
 328   else {
 329     xabort(0);
 330   }
 331   bind(L_on_abort);
 332   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 333   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 334     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 335   }
 336   if (RTMRetryCount > 0) {
 337     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 338     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 339   }
 340 
 341   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 342   testptr(tmpReg, tmpReg) ;
 343   jccb(Assembler::notZero, L_decrement_retry) ;
 344 
 345   // Appears unlocked - try to swing _owner from null to non-null.
 346   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 347 #ifdef _LP64
 348   Register threadReg = r15_thread;
 349 #else
 350   get_thread(scrReg);
 351   Register threadReg = scrReg;
 352 #endif
 353   lock();
 354   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 355 
 356   if (RTMRetryCount > 0) {
 357     // success done else retry
 358     jccb(Assembler::equal, DONE_LABEL) ;
 359     bind(L_decrement_retry);
 360     // Spin and retry if lock is busy.
 361     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 362   }
 363   else {
 364     bind(L_decrement_retry);
 365   }
 366 }
 367 
 368 #endif //  INCLUDE_RTM_OPT
 369 
 370 // fast_lock and fast_unlock used by C2
 371 
 372 // Because the transitions from emitted code to the runtime
 373 // monitorenter/exit helper stubs are so slow it's critical that
 374 // we inline both the stack-locking fast path and the inflated fast path.
 375 //
 376 // See also: cmpFastLock and cmpFastUnlock.
 377 //
 378 // What follows is a specialized inline transliteration of the code
 379 // in enter() and exit(). If we're concerned about I$ bloat another
 380 // option would be to emit TrySlowEnter and TrySlowExit methods
 381 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 383 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 384 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 385 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer if the
// processor uses simple bimodal branch predictors keyed by EIP, since the helper
// routines would be called from multiple synchronization sites.
 390 //
 391 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 392 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 393 // to those specialized methods.  That'd give us a mostly platform-independent
 394 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross over into native code would be
 396 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 397 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 398 // (b) explicit barriers or fence operations.
 399 //
 400 // TODO:
 401 //
 402 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 403 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 404 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 405 //    the lock operators would typically be faster than reifying Self.
 406 //
 407 // *  Ideally I'd define the primitives as:
 408 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 409 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 410 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 412 //    Furthermore the register assignments are overconstrained, possibly resulting in
 413 //    sub-optimal code near the synchronization site.
 414 //
 415 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 416 //    Alternately, use a better sp-proximity test.
 417 //
 418 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 419 //    Either one is sufficient to uniquely identify a thread.
 420 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 421 //
 422 // *  Intrinsify notify() and notifyAll() for the common cases where the
 423 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 425 //
 426 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 427 //    But beware of excessive branch density on AMD Opterons.
 428 //
 429 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 430 //    or failure of the fast path.  If the fast path fails then we pass
 431 //    control to the slow path, typically in C.  In fast_lock and
 432 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 433 //    will emit a conditional branch immediately after the node.
 434 //    So we have branches to branches and lots of ICC.ZF games.
 435 //    Instead, it might be better to have C2 pass a "FailureLabel"
 436 //    into fast_lock and fast_unlock.  In the case of success, control
 437 //    will drop through the node.  ICC.ZF is undefined at exit.
 438 //    In the case of failure, the node will branch directly to the
 439 //    FailureLabel
 440 
 441 
 442 // obj: object to lock
 443 // box: on-stack box address (displaced header location) - KILLED
 444 // rax,: tmp -- KILLED
 445 // scr: tmp -- KILLED
 446 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 447                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 448                                  RTMLockingCounters* rtm_counters,
 449                                  RTMLockingCounters* stack_rtm_counters,
 450                                  Metadata* method_data,
 451                                  bool use_rtm, bool profile_rtm) {
 452   // Ensure the register assignments are disjoint
 453   assert(tmpReg == rax, "");
 454 
 455   if (use_rtm) {
 456     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 457   } else {
 458     assert(cx2Reg == noreg, "");
 459     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 460   }
 461 
 462   // Possible cases that we'll encounter in fast_lock
 463   // ------------------------------------------------
 464   // * Inflated
 465   //    -- unlocked
 466   //    -- Locked
 467   //       = by self
 468   //       = by other
 469   // * neutral
 470   // * stack-locked
 471   //    -- by self
 472   //       = sp-proximity test hits
 473   //       = sp-proximity test generates false-negative
 474   //    -- by other
 475   //
 476 
 477   Label IsInflated, DONE_LABEL;
 478 
 479   if (DiagnoseSyncOnValueBasedClasses != 0) {
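    // Synchronization on value-based classes is diagnosed in the runtime: jump to
    // DONE_LABEL with ZF=0 so control passes to the slow path.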
 480     load_klass(tmpReg, objReg, cx1Reg);
 481     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 482     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 483     jcc(Assembler::notZero, DONE_LABEL);
 484   }
 485 
 486 #if INCLUDE_RTM_OPT
 487   if (UseRTMForStackLocks && use_rtm) {
 488     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 489                       stack_rtm_counters, method_data, profile_rtm,
 490                       DONE_LABEL, IsInflated);
 491   }
 492 #endif // INCLUDE_RTM_OPT
 493 
 494   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 495   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 496   jccb(Assembler::notZero, IsInflated);
 497 
 498   // Attempt stack-locking ...
 499   orptr (tmpReg, markWord::unlocked_value);
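  // tmpReg (rax) now holds the mark word with the unlocked bit set, the value the
  // CAS below expects to find in the object header.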
 500   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 501   lock();
 502   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 503   jcc(Assembler::equal, DONE_LABEL);           // Success
 504 
 505   // Recursive locking.
 506   // The object is stack-locked: markword contains stack pointer to BasicLock.
 507   // Locked by current thread if difference with current SP is less than one page.
 508   subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 510   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 511   movptr(Address(boxReg, 0), tmpReg);
 512   jmp(DONE_LABEL);
 513 
 514   bind(IsInflated);
 515   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 516 
 517 #if INCLUDE_RTM_OPT
 518   // Use the same RTM locking code in 32- and 64-bit VM.
 519   if (use_rtm) {
 520     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 521                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 522   } else {
 523 #endif // INCLUDE_RTM_OPT
 524 
 525 #ifndef _LP64
 526   // The object is inflated.
 527 
 528   // boxReg refers to the on-stack BasicLock in the current frame.
 529   // We'd like to write:
 530   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 532   // additional latency as we have another ST in the store buffer that must drain.
 533 
 534   // avoid ST-before-CAS
 535   // register juggle because we need tmpReg for cmpxchgptr below
 536   movptr(scrReg, boxReg);
 537   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 538 
 539   // Optimistic form: consider XORL tmpReg,tmpReg
 540   movptr(tmpReg, NULL_WORD);
 541 
 542   // Appears unlocked - try to swing _owner from null to non-null.
 543   // Ideally, I'd manifest "Self" with get_thread and then attempt
 544   // to CAS the register containing Self into m->Owner.
 545   // But we don't have enough registers, so instead we can either try to CAS
 546   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 547   // we later store "Self" into m->Owner.  Transiently storing a stack address
 548   // (rsp or the address of the box) into  m->owner is harmless.
 549   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 550   lock();
 551   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 552   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 553   // If we weren't able to swing _owner from NULL to the BasicLock
 554   // then take the slow path.
 555   jccb  (Assembler::notZero, DONE_LABEL);
 556   // update _owner from BasicLock to thread
 557   get_thread (scrReg);                    // beware: clobbers ICCs
 558   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 559   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 560 
 561   // If the CAS fails we can either retry or pass control to the slow path.
 562   // We use the latter tactic.
 563   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 564   // If the CAS was successful ...
 565   //   Self has acquired the lock
 566   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 567   // Intentional fall-through into DONE_LABEL ...
 568 #else // _LP64
 569   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 570   movq(scrReg, tmpReg);
 571   xorq(tmpReg, tmpReg);
 572   lock();
 573   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 574   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 575   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 576   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 577   // Intentional fall-through into DONE_LABEL ...
 578   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 579 #endif // _LP64
 580 #if INCLUDE_RTM_OPT
 581   } // use_rtm()
 582 #endif
 583   // DONE_LABEL is a hot target - we'd really like to place it at the
 584   // start of cache line by padding with NOPs.
 585   // See the AMD and Intel software optimization manuals for the
 586   // most efficient "long" NOP encodings.
 587   // Unfortunately none of our alignment mechanisms suffice.
 588   bind(DONE_LABEL);
 589 
 590   // At DONE_LABEL the icc ZFlag is set as follows ...
 591   // fast_unlock uses the same protocol.
 592   // ZFlag == 1 -> Success
 593   // ZFlag == 0 -> Failure - force control through the slow path
 594 }
 595 
 596 // obj: object to unlock
 597 // box: box address (displaced header location), killed.  Must be EAX.
 598 // tmp: killed, cannot be obj nor box.
 599 //
 600 // Some commentary on balanced locking:
 601 //
 602 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 603 // Methods that don't have provably balanced locking are forced to run in the
 604 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 605 // The interpreter provides two properties:
 606 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 608 //      interpreter maintains an on-stack list of locks currently held by
 609 //      a frame.
 610 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 612 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 614 // B() doesn't have provably balanced locking so it runs in the interpreter.
 615 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 616 // is still locked by A().
 617 //
 618 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 619 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 620 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 621 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 625 // A perfectly viable alternative is to elide the owner check except when
 626 // Xcheck:jni is enabled.
 627 
 628 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 629   assert(boxReg == rax, "");
 630   assert_different_registers(objReg, boxReg, tmpReg);
 631 
 632   Label DONE_LABEL, Stacked, CheckSucc;
 633 
 634 #if INCLUDE_RTM_OPT
 635   if (UseRTMForStackLocks && use_rtm) {
 636     Label L_regular_unlock;
 637     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 638     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 639     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 640     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 641     xend();                                                           // otherwise end...
 642     jmp(DONE_LABEL);                                                  // ... and we're done
 643     bind(L_regular_unlock);
 644   }
 645 #endif
 646 
 647   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 648   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 649   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 650   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 651   jccb  (Assembler::zero, Stacked);
 652 
 653   // It's inflated.
 654 #if INCLUDE_RTM_OPT
 655   if (use_rtm) {
 656     Label L_regular_inflated_unlock;
 657     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 658     movptr(boxReg, Address(tmpReg, owner_offset));
 659     testptr(boxReg, boxReg);
 660     jccb(Assembler::notZero, L_regular_inflated_unlock);
 661     xend();
 662     jmpb(DONE_LABEL);
 663     bind(L_regular_inflated_unlock);
 664   }
 665 #endif
 666 
 667   // Despite our balanced locking property we still check that m->_owner == Self
 668   // as java routines or native JNI code called by this thread might
 669   // have released the lock.
 670   // Refer to the comments in synchronizer.cpp for how we might encode extra
 671   // state in _succ so we can avoid fetching EntryList|cxq.
 672   //
 673   // I'd like to add more cases in fast_lock() and fast_unlock() --
 674   // such as recursive enter and exit -- but we have to be wary of
 675   // I$ bloat, T$ effects and BP$ effects.
 676   //
 677   // If there's no contention try a 1-0 exit.  That is, exit without
 678   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 679   // we detect and recover from the race that the 1-0 exit admits.
 680   //
 681   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 682   // before it STs null into _owner, releasing the lock.  Updates
 683   // to data protected by the critical section must be visible before
 684   // we drop the lock (and thus before any other thread could acquire
 685   // the lock and observe the fields protected by the lock).
 686   // IA32's memory-model is SPO, so STs are ordered with respect to
 687   // each other and there's no need for an explicit barrier (fence).
 688   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 689 #ifndef _LP64
 690   get_thread (boxReg);
 691 
 692   // Note that we could employ various encoding schemes to reduce
 693   // the number of loads below (currently 4) to just 2 or 3.
 694   // Refer to the comments in synchronizer.cpp.
 695   // In practice the chain of fetches doesn't seem to impact performance, however.
 696   xorptr(boxReg, boxReg);
 697   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 698   jccb  (Assembler::notZero, DONE_LABEL);
 699   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 700   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 701   jccb  (Assembler::notZero, CheckSucc);
 702   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 703   jmpb  (DONE_LABEL);
 704 
 705   bind (Stacked);
 706   // It's not inflated and it's not recursively stack-locked.
 707   // It must be stack-locked.
 708   // Try to reset the header to displaced header.
 709   // The "box" value on the stack is stable, so we can reload
 710   // and be assured we observe the same value as above.
 711   movptr(tmpReg, Address(boxReg, 0));
 712   lock();
 713   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
 715 
 716   // DONE_LABEL is a hot target - we'd really like to place it at the
 717   // start of cache line by padding with NOPs.
 718   // See the AMD and Intel software optimization manuals for the
 719   // most efficient "long" NOP encodings.
 720   // Unfortunately none of our alignment mechanisms suffice.
 721   bind (CheckSucc);
 722 #else // _LP64
 723   // It's inflated
 724   xorptr(boxReg, boxReg);
 725   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 726   jccb  (Assembler::notZero, DONE_LABEL);
 727   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 728   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 729   jccb  (Assembler::notZero, CheckSucc);
 730   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 731   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 732   jmpb  (DONE_LABEL);
 733 
 734   // Try to avoid passing control into the slow_path ...
 735   Label LSuccess, LGoSlowPath ;
 736   bind  (CheckSucc);
 737 
 738   // The following optional optimization can be elided if necessary
 739   // Effectively: if (succ == null) goto slow path
 740   // The code reduces the window for a race, however,
 741   // and thus benefits performance.
 742   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 743   jccb  (Assembler::zero, LGoSlowPath);
 744 
 745   xorptr(boxReg, boxReg);
 746   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 747   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 748 
 749   // Memory barrier/fence
 750   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 751   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 752   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 753   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 754   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 755   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 756   lock(); addl(Address(rsp, 0), 0);
 757 
 758   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 759   jccb  (Assembler::notZero, LSuccess);
 760 
 761   // Rare inopportune interleaving - race.
 762   // The successor vanished in the small window above.
 763   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 764   // We need to ensure progress and succession.
 765   // Try to reacquire the lock.
 766   // If that fails then the new owner is responsible for succession and this
 767   // thread needs to take no further action and can exit via the fast path (success).
 768   // If the re-acquire succeeds then pass control into the slow path.
 769   // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.
 772 
 773   // box is really RAX -- the following CMPXCHG depends on that binding
 774   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 775   lock();
 776   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 777   // There's no successor so we tried to regrab the lock.
 778   // If that didn't work, then another thread grabbed the
 779   // lock so we're done (and exit was a success).
 780   jccb  (Assembler::notEqual, LSuccess);
 781   // Intentional fall-through into slow path
 782 
 783   bind  (LGoSlowPath);
 784   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 785   jmpb  (DONE_LABEL);
 786 
 787   bind  (LSuccess);
 788   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 789   jmpb  (DONE_LABEL);
 790 
 791   bind  (Stacked);
 792   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 793   lock();
 794   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 795 
 796 #endif
 797   bind(DONE_LABEL);
 798 }
 799 
 800 //-------------------------------------------------------------------------------------------
 801 // Generic instructions support for use in .ad files C2 code generation
 802 
 803 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
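  // AbsVD clears the sign bit by ANDing with the sign mask; NegVD flips it by XORing
  // with the sign-flip constant.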
 804   if (dst != src) {
 805     movdqu(dst, src);
 806   }
 807   if (opcode == Op_AbsVD) {
 808     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 809   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 811     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 812   }
 813 }
 814 
 815 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 816   if (opcode == Op_AbsVD) {
 817     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 818   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 820     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 821   }
 822 }
 823 
 824 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 825   if (dst != src) {
 826     movdqu(dst, src);
 827   }
 828   if (opcode == Op_AbsVF) {
 829     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 830   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 832     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 833   }
 834 }
 835 
 836 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 837   if (opcode == Op_AbsVF) {
 838     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 839   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 841     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 842   }
 843 }
 844 
 845 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 846   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 847   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 848 
 849   if (opcode == Op_MinV) {
 850     if (elem_bt == T_BYTE) {
 851       pminsb(dst, src);
 852     } else if (elem_bt == T_SHORT) {
 853       pminsw(dst, src);
 854     } else if (elem_bt == T_INT) {
 855       pminsd(dst, src);
 856     } else {
 857       assert(elem_bt == T_LONG, "required");
 858       assert(tmp == xmm0, "required");
 859       assert_different_registers(dst, src, tmp);
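      // There is no packed 64-bit min/max before AVX-512: compare with pcmpgtq and
      // select with blendvpd, which uses xmm0 as its implicit mask.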
 860       movdqu(xmm0, dst);
 861       pcmpgtq(xmm0, src);
 862       blendvpd(dst, src);  // xmm0 as mask
 863     }
 864   } else { // opcode == Op_MaxV
 865     if (elem_bt == T_BYTE) {
 866       pmaxsb(dst, src);
 867     } else if (elem_bt == T_SHORT) {
 868       pmaxsw(dst, src);
 869     } else if (elem_bt == T_INT) {
 870       pmaxsd(dst, src);
 871     } else {
 872       assert(elem_bt == T_LONG, "required");
 873       assert(tmp == xmm0, "required");
 874       assert_different_registers(dst, src, tmp);
 875       movdqu(xmm0, src);
 876       pcmpgtq(xmm0, dst);
 877       blendvpd(dst, src);  // xmm0 as mask
 878     }
 879   }
 880 }
 881 
 882 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 883                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 884                                  int vlen_enc) {
 885   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 886 
 887   if (opcode == Op_MinV) {
 888     if (elem_bt == T_BYTE) {
 889       vpminsb(dst, src1, src2, vlen_enc);
 890     } else if (elem_bt == T_SHORT) {
 891       vpminsw(dst, src1, src2, vlen_enc);
 892     } else if (elem_bt == T_INT) {
 893       vpminsd(dst, src1, src2, vlen_enc);
 894     } else {
 895       assert(elem_bt == T_LONG, "required");
 896       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 897         vpminsq(dst, src1, src2, vlen_enc);
 898       } else {
 899         assert_different_registers(dst, src1, src2);
 900         vpcmpgtq(dst, src1, src2, vlen_enc);
 901         vblendvpd(dst, src1, src2, dst, vlen_enc);
 902       }
 903     }
 904   } else { // opcode == Op_MaxV
 905     if (elem_bt == T_BYTE) {
 906       vpmaxsb(dst, src1, src2, vlen_enc);
 907     } else if (elem_bt == T_SHORT) {
 908       vpmaxsw(dst, src1, src2, vlen_enc);
 909     } else if (elem_bt == T_INT) {
 910       vpmaxsd(dst, src1, src2, vlen_enc);
 911     } else {
 912       assert(elem_bt == T_LONG, "required");
 913       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 914         vpmaxsq(dst, src1, src2, vlen_enc);
 915       } else {
 916         assert_different_registers(dst, src1, src2);
 917         vpcmpgtq(dst, src1, src2, vlen_enc);
 918         vblendvpd(dst, src2, src1, dst, vlen_enc);
 919       }
 920     }
 921   }
 922 }
 923 
 924 // Float/Double min max
 925 
 926 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 927                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 928                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 929                                    int vlen_enc) {
 930   assert(UseAVX > 0, "required");
 931   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 932          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 933   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 934   assert_different_registers(a, b, tmp, atmp, btmp);
 935 
 936   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 937   bool is_double_word = is_double_word_type(elem_bt);
 938 
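  // Java min/max semantics: -0.0 is treated as smaller than +0.0 and NaN is propagated.
  // The leading blends swap the operands based on a sign bit so the zero cases resolve
  // correctly, min/max picks the candidate, and the final blend re-injects NaN lanes
  // flagged by the unordered compare.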
 939   if (!is_double_word && is_min) {
 940     vblendvps(atmp, a, b, a, vlen_enc);
 941     vblendvps(btmp, b, a, a, vlen_enc);
 942     vminps(tmp, atmp, btmp, vlen_enc);
 943     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 944     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 945   } else if (!is_double_word && !is_min) {
 946     vblendvps(btmp, b, a, b, vlen_enc);
 947     vblendvps(atmp, a, b, b, vlen_enc);
 948     vmaxps(tmp, atmp, btmp, vlen_enc);
 949     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 950     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 951   } else if (is_double_word && is_min) {
 952     vblendvpd(atmp, a, b, a, vlen_enc);
 953     vblendvpd(btmp, b, a, a, vlen_enc);
 954     vminpd(tmp, atmp, btmp, vlen_enc);
 955     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 956     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 957   } else {
 958     assert(is_double_word && !is_min, "sanity");
 959     vblendvpd(btmp, b, a, b, vlen_enc);
 960     vblendvpd(atmp, a, b, b, vlen_enc);
 961     vmaxpd(tmp, atmp, btmp, vlen_enc);
 962     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 963     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 964   }
 965 }
 966 
 967 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 968                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 969                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 970                                     int vlen_enc) {
 971   assert(UseAVX > 2, "required");
 972   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 973          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 974   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 975   assert_different_registers(dst, a, b, atmp, btmp);
 976 
 977   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 978   bool is_double_word = is_double_word_type(elem_bt);
 979   bool merge = true;
 980 
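  // AVX-512 variant of the same approach: evpmovd2m/evpmovq2m turn the sign bits into an
  // opmask used to order the operands, and the unordered-compare mask merges NaN lanes
  // from atmp into dst.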
 981   if (!is_double_word && is_min) {
 982     evpmovd2m(ktmp, a, vlen_enc);
 983     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
 984     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
 985     vminps(dst, atmp, btmp, vlen_enc);
 986     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 987     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
 988   } else if (!is_double_word && !is_min) {
 989     evpmovd2m(ktmp, b, vlen_enc);
 990     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
 991     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
 992     vmaxps(dst, atmp, btmp, vlen_enc);
 993     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 994     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
 995   } else if (is_double_word && is_min) {
 996     evpmovq2m(ktmp, a, vlen_enc);
 997     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
 998     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
 999     vminpd(dst, atmp, btmp, vlen_enc);
1000     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1001     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1002   } else {
1003     assert(is_double_word && !is_min, "sanity");
1004     evpmovq2m(ktmp, b, vlen_enc);
1005     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1006     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1007     vmaxpd(dst, atmp, btmp, vlen_enc);
1008     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1009     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1010   }
1011 }
1012 
1013 // Float/Double signum
1014 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1015                                   XMMRegister zero, XMMRegister one,
1016                                   Register scratch) {
1017   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1018 
1019   Label DONE_LABEL;
1020 
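  // signum: return the argument unchanged for +/-0.0 and NaN; otherwise produce 1.0 with
  // the sign of the input (the trailing XOR with the sign-flip constant yields -1.0).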
1021   if (opcode == Op_SignumF) {
1022     assert(UseSSE > 0, "required");
1023     ucomiss(dst, zero);
1024     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1025     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1026     movflt(dst, one);
1027     jcc(Assembler::above, DONE_LABEL);
1028     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1029   } else if (opcode == Op_SignumD) {
1030     assert(UseSSE > 1, "required");
1031     ucomisd(dst, zero);
1032     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1033     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1034     movdbl(dst, one);
1035     jcc(Assembler::above, DONE_LABEL);
1036     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1037   }
1038 
1039   bind(DONE_LABEL);
1040 }
1041 
1042 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1043   if (sign) {
1044     pmovsxbw(dst, src);
1045   } else {
1046     pmovzxbw(dst, src);
1047   }
1048 }
1049 
1050 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1051   if (sign) {
1052     vpmovsxbw(dst, src, vector_len);
1053   } else {
1054     vpmovzxbw(dst, src, vector_len);
1055   }
1056 }
1057 
1058 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1059   if (sign) {
1060     vpmovsxbd(dst, src, vector_len);
1061   } else {
1062     vpmovzxbd(dst, src, vector_len);
1063   }
1064 }
1065 
1066 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1067   if (sign) {
1068     vpmovsxwd(dst, src, vector_len);
1069   } else {
1070     vpmovzxwd(dst, src, vector_len);
1071   }
1072 }
1073 
1074 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1075                                      int shift, int vector_len) {
1076   if (opcode == Op_RotateLeftV) {
1077     if (etype == T_INT) {
1078       evprold(dst, src, shift, vector_len);
1079     } else {
1080       assert(etype == T_LONG, "expected type T_LONG");
1081       evprolq(dst, src, shift, vector_len);
1082     }
1083   } else {
1084     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1085     if (etype == T_INT) {
1086       evprord(dst, src, shift, vector_len);
1087     } else {
1088       assert(etype == T_LONG, "expected type T_LONG");
1089       evprorq(dst, src, shift, vector_len);
1090     }
1091   }
1092 }
1093 
1094 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1095                                      XMMRegister shift, int vector_len) {
1096   if (opcode == Op_RotateLeftV) {
1097     if (etype == T_INT) {
1098       evprolvd(dst, src, shift, vector_len);
1099     } else {
1100       assert(etype == T_LONG, "expected type T_LONG");
1101       evprolvq(dst, src, shift, vector_len);
1102     }
1103   } else {
1104     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1105     if (etype == T_INT) {
1106       evprorvd(dst, src, shift, vector_len);
1107     } else {
1108       assert(etype == T_LONG, "expected type T_LONG");
1109       evprorvq(dst, src, shift, vector_len);
1110     }
1111   }
1112 }
1113 
1114 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1115   if (opcode == Op_RShiftVI) {
1116     psrad(dst, shift);
1117   } else if (opcode == Op_LShiftVI) {
1118     pslld(dst, shift);
1119   } else {
1120     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1121     psrld(dst, shift);
1122   }
1123 }
1124 
1125 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1126   switch (opcode) {
1127     case Op_RShiftVI:  psrad(dst, shift); break;
1128     case Op_LShiftVI:  pslld(dst, shift); break;
1129     case Op_URShiftVI: psrld(dst, shift); break;
1130 
1131     default: assert(false, "%s", NodeClassNames[opcode]);
1132   }
1133 }
1134 
1135 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1136   if (opcode == Op_RShiftVI) {
1137     vpsrad(dst, nds, shift, vector_len);
1138   } else if (opcode == Op_LShiftVI) {
1139     vpslld(dst, nds, shift, vector_len);
1140   } else {
1141     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1142     vpsrld(dst, nds, shift, vector_len);
1143   }
1144 }
1145 
1146 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1147   switch (opcode) {
1148     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1149     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1150     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1151 
1152     default: assert(false, "%s", NodeClassNames[opcode]);
1153   }
1154 }
1155 
1156 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1157   switch (opcode) {
1158     case Op_RShiftVB:  // fall-through
1159     case Op_RShiftVS:  psraw(dst, shift); break;
1160 
1161     case Op_LShiftVB:  // fall-through
1162     case Op_LShiftVS:  psllw(dst, shift);   break;
1163 
1164     case Op_URShiftVS: // fall-through
1165     case Op_URShiftVB: psrlw(dst, shift);  break;
1166 
1167     default: assert(false, "%s", NodeClassNames[opcode]);
1168   }
1169 }
1170 
1171 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1172   switch (opcode) {
1173     case Op_RShiftVB:  // fall-through
1174     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1175 
1176     case Op_LShiftVB:  // fall-through
1177     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1178 
1179     case Op_URShiftVS: // fall-through
1180     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1181 
1182     default: assert(false, "%s", NodeClassNames[opcode]);
1183   }
1184 }
1185 
1186 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1187   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1189     case Op_LShiftVL:  psllq(dst, shift); break;
1190     case Op_URShiftVL: psrlq(dst, shift); break;
1191 
1192     default: assert(false, "%s", NodeClassNames[opcode]);
1193   }
1194 }
1195 
1196 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1197   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1199   } else if (opcode == Op_LShiftVL) {
1200     psllq(dst, shift);
1201   } else {
1202     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1203     psrlq(dst, shift);
1204   }
1205 }
1206 
1207 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1208   switch (opcode) {
1209     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1210     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1211     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1212 
1213     default: assert(false, "%s", NodeClassNames[opcode]);
1214   }
1215 }
1216 
1217 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1218   if (opcode == Op_RShiftVL) {
1219     evpsraq(dst, nds, shift, vector_len);
1220   } else if (opcode == Op_LShiftVL) {
1221     vpsllq(dst, nds, shift, vector_len);
1222   } else {
1223     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1224     vpsrlq(dst, nds, shift, vector_len);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1229   switch (opcode) {
1230     case Op_RShiftVB:  // fall-through
1231     case Op_RShiftVS:  // fall-through
1232     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1233 
1234     case Op_LShiftVB:  // fall-through
1235     case Op_LShiftVS:  // fall-through
1236     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1237 
1238     case Op_URShiftVB: // fall-through
1239     case Op_URShiftVS: // fall-through
1240     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1241 
1242     default: assert(false, "%s", NodeClassNames[opcode]);
1243   }
1244 }
1245 
1246 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1247   switch (opcode) {
1248     case Op_RShiftVB:  // fall-through
1249     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1250 
1251     case Op_LShiftVB:  // fall-through
1252     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1253 
1254     case Op_URShiftVB: // fall-through
1255     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1256 
1257     default: assert(false, "%s", NodeClassNames[opcode]);
1258   }
1259 }
1260 
1261 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1262   assert(UseAVX >= 2, "required");
1263   switch (opcode) {
1264     case Op_RShiftVL: {
1265       if (UseAVX > 2) {
1266         assert(tmp == xnoreg, "not used");
1267         if (!VM_Version::supports_avx512vl()) {
1268           vlen_enc = Assembler::AVX_512bit;
1269         }
1270         evpsravq(dst, src, shift, vlen_enc);
1271       } else {
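        // No variable 64-bit arithmetic shift below AVX-512; emulate it as
        //   sra(x, s) == (srl(x, s) ^ m) - m   where m = srl(sign_mask, s),
        // i.e. shift logically, then XOR and subtract the shifted sign mask to
        // propagate the sign bits into the vacated positions.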
1272         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1273         vpsrlvq(dst, src, shift, vlen_enc);
1274         vpsrlvq(tmp, tmp, shift, vlen_enc);
1275         vpxor(dst, dst, tmp, vlen_enc);
1276         vpsubq(dst, dst, tmp, vlen_enc);
1277       }
1278       break;
1279     }
1280     case Op_LShiftVL: {
1281       assert(tmp == xnoreg, "not used");
1282       vpsllvq(dst, src, shift, vlen_enc);
1283       break;
1284     }
1285     case Op_URShiftVL: {
1286       assert(tmp == xnoreg, "not used");
1287       vpsrlvq(dst, src, shift, vlen_enc);
1288       break;
1289     }
1290     default: assert(false, "%s", NodeClassNames[opcode]);
1291   }
1292 }
1293 
1294 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
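// (bytes are widened to dwords, shifted with the variable dword shift, masked
//  back into byte range and packed down to words)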
1295 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1296   assert(opcode == Op_LShiftVB ||
1297          opcode == Op_RShiftVB ||
1298          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1299   bool sign = (opcode != Op_URShiftVB);
1300   assert(vector_len == 0, "required");
1301   vextendbd(sign, dst, src, 1);
1302   vpmovzxbd(vtmp, shift, 1);
1303   varshiftd(opcode, dst, dst, vtmp, 1);
1304   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1305   vextracti128_high(vtmp, dst);
1306   vpackusdw(dst, dst, vtmp, 0);
1307 }
1308 
1309 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
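// (bytes are widened to words, shifted with the variable word shift, masked
//  back into byte range and packed down to bytes)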
1310 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1311   assert(opcode == Op_LShiftVB ||
1312          opcode == Op_RShiftVB ||
1313          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1314   bool sign = (opcode != Op_URShiftVB);
1315   int ext_vector_len = vector_len + 1;
1316   vextendbw(sign, dst, src, ext_vector_len);
1317   vpmovzxbw(vtmp, shift, ext_vector_len);
1318   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1319   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1320   if (vector_len == 0) {
1321     vextracti128_high(vtmp, dst);
1322     vpackuswb(dst, dst, vtmp, vector_len);
1323   } else {
1324     vextracti64x4_high(vtmp, dst);
1325     vpackuswb(dst, dst, vtmp, vector_len);
1326     vpermq(dst, dst, 0xD8, vector_len);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1331   switch(typ) {
1332     case T_BYTE:
1333       pinsrb(dst, val, idx);
1334       break;
1335     case T_SHORT:
1336       pinsrw(dst, val, idx);
1337       break;
1338     case T_INT:
1339       pinsrd(dst, val, idx);
1340       break;
1341     case T_LONG:
1342       pinsrq(dst, val, idx);
1343       break;
1344     default:
1345       assert(false,"Should not reach here.");
1346       break;
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1351   switch(typ) {
1352     case T_BYTE:
1353       vpinsrb(dst, src, val, idx);
1354       break;
1355     case T_SHORT:
1356       vpinsrw(dst, src, val, idx);
1357       break;
1358     case T_INT:
1359       vpinsrd(dst, src, val, idx);
1360       break;
1361     case T_LONG:
1362       vpinsrq(dst, src, val, idx);
1363       break;
1364     default:
1365       assert(false,"Should not reach here.");
1366       break;
1367   }
1368 }
1369 
1370 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1371   switch(typ) {
1372     case T_INT:
1373       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1374       break;
1375     case T_FLOAT:
1376       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1377       break;
1378     case T_LONG:
1379       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1380       break;
1381     case T_DOUBLE:
1382       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1383       break;
1384     default:
1385       assert(false,"Should not reach here.");
1386       break;
1387   }
1388 }
1389 
1390 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1391   switch(typ) {
1392     case T_INT:
1393       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1394       break;
1395     case T_FLOAT:
1396       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1397       break;
1398     case T_LONG:
1399       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1400       break;
1401     case T_DOUBLE:
1402       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1403       break;
1404     default:
1405       assert(false,"Should not reach here.");
1406       break;
1407   }
1408 }
1409 
1410 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1411   switch(typ) {
1412     case T_INT:
1413       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1414       break;
1415     case T_FLOAT:
1416       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1417       break;
1418     case T_LONG:
1419       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1420       break;
1421     case T_DOUBLE:
1422       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1423       break;
1424     default:
1425       assert(false,"Should not reach here.");
1426       break;
1427   }
1428 }
1429 
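// Expand a vector of booleans (0 or 1 per byte) into a full-width element mask:
// 0 - 1 == 0xFF and 0 - 0 == 0x00, and the resulting byte mask is sign-extended
// to the requested element size.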
1430 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1431   if (vlen_in_bytes <= 16) {
1432     pxor (dst, dst);
1433     psubb(dst, src);
1434     switch (elem_bt) {
1435       case T_BYTE:   /* nothing to do */ break;
1436       case T_SHORT:  pmovsxbw(dst, dst); break;
1437       case T_INT:    pmovsxbd(dst, dst); break;
1438       case T_FLOAT:  pmovsxbd(dst, dst); break;
1439       case T_LONG:   pmovsxbq(dst, dst); break;
1440       case T_DOUBLE: pmovsxbq(dst, dst); break;
1441 
1442       default: assert(false, "%s", type2name(elem_bt));
1443     }
1444   } else {
1445     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1446     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1447 
1448     vpxor (dst, dst, dst, vlen_enc);
1449     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1450 
1451     switch (elem_bt) {
1452       case T_BYTE:   /* nothing to do */            break;
1453       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1454       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1455       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1456       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1457       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1458 
1459       default: assert(false, "%s", type2name(elem_bt));
1460     }
1461   }
1462 }
1463 
1464 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1465   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1466   if (vlen_in_bytes == 4) {
1467     movdl(dst, addr);
1468   } else if (vlen_in_bytes == 8) {
1469     movq(dst, addr);
1470   } else if (vlen_in_bytes == 16) {
1471     movdqu(dst, addr, scratch);
1472   } else if (vlen_in_bytes == 32) {
1473     vmovdqu(dst, addr, scratch);
1474   } else {
1475     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1476     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1477   }
1478 }
1479 
1480 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1481 
1482 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1483   int vector_len = Assembler::AVX_128bit;
1484 
1485   switch (opcode) {
1486     case Op_AndReductionV:  pand(dst, src); break;
1487     case Op_OrReductionV:   por (dst, src); break;
1488     case Op_XorReductionV:  pxor(dst, src); break;
1489     case Op_MinReductionV:
1490       switch (typ) {
1491         case T_BYTE:        pminsb(dst, src); break;
1492         case T_SHORT:       pminsw(dst, src); break;
1493         case T_INT:         pminsd(dst, src); break;
1494         case T_LONG:        assert(UseAVX > 2, "required");
1495                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1496         default:            assert(false, "wrong type");
1497       }
1498       break;
1499     case Op_MaxReductionV:
1500       switch (typ) {
1501         case T_BYTE:        pmaxsb(dst, src); break;
1502         case T_SHORT:       pmaxsw(dst, src); break;
1503         case T_INT:         pmaxsd(dst, src); break;
1504         case T_LONG:        assert(UseAVX > 2, "required");
1505                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1506         default:            assert(false, "wrong type");
1507       }
1508       break;
1509     case Op_AddReductionVF: addss(dst, src); break;
1510     case Op_AddReductionVD: addsd(dst, src); break;
1511     case Op_AddReductionVI:
1512       switch (typ) {
1513         case T_BYTE:        paddb(dst, src); break;
1514         case T_SHORT:       paddw(dst, src); break;
1515         case T_INT:         paddd(dst, src); break;
1516         default:            assert(false, "wrong type");
1517       }
1518       break;
1519     case Op_AddReductionVL: paddq(dst, src); break;
1520     case Op_MulReductionVF: mulss(dst, src); break;
1521     case Op_MulReductionVD: mulsd(dst, src); break;
1522     case Op_MulReductionVI:
1523       switch (typ) {
1524         case T_SHORT:       pmullw(dst, src); break;
1525         case T_INT:         pmulld(dst, src); break;
1526         default:            assert(false, "wrong type");
1527       }
1528       break;
1529     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1530                             vpmullq(dst, dst, src, vector_len); break;
1531     default:                assert(false, "wrong opcode");
1532   }
1533 }
1534 
1535 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1536   int vector_len = Assembler::AVX_256bit;
1537 
1538   switch (opcode) {
1539     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1540     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1541     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1542     case Op_MinReductionV:
1543       switch (typ) {
1544         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1545         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1546         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1547         case T_LONG:        assert(UseAVX > 2, "required");
1548                             vpminsq(dst, src1, src2, vector_len); break;
1549         default:            assert(false, "wrong type");
1550       }
1551       break;
1552     case Op_MaxReductionV:
1553       switch (typ) {
1554         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1555         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1556         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1557         case T_LONG:        assert(UseAVX > 2, "required");
1558                             vpmaxsq(dst, src1, src2, vector_len); break;
1559         default:            assert(false, "wrong type");
1560       }
1561       break;
1562     case Op_AddReductionVI:
1563       switch (typ) {
1564         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1565         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1566         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1567         default:            assert(false, "wrong type");
1568       }
1569       break;
1570     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1571     case Op_MulReductionVI:
1572       switch (typ) {
1573         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1574         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1575         default:            assert(false, "wrong type");
1576       }
1577       break;
1578     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1579     default:                assert(false, "wrong opcode");
1580   }
1581 }
1582 
1583 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1584                                   XMMRegister dst, XMMRegister src,
1585                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1586   switch (opcode) {
1587     case Op_AddReductionVF:
1588     case Op_MulReductionVF:
1589       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1590       break;
1591 
1592     case Op_AddReductionVD:
1593     case Op_MulReductionVD:
1594       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1595       break;
1596 
1597     default: assert(false, "wrong opcode");
1598   }
1599 }
1600 
1601 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1602                              Register dst, Register src1, XMMRegister src2,
1603                              XMMRegister vtmp1, XMMRegister vtmp2) {
1604   switch (vlen) {
1605     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1606     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1607     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1608     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1609 
1610     default: assert(false, "wrong vector length");
1611   }
1612 }
1613 
1614 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1615                              Register dst, Register src1, XMMRegister src2,
1616                              XMMRegister vtmp1, XMMRegister vtmp2) {
1617   switch (vlen) {
1618     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1619     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1620     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1621     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1622 
1623     default: assert(false, "wrong vector length");
1624   }
1625 }
1626 
1627 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1628                              Register dst, Register src1, XMMRegister src2,
1629                              XMMRegister vtmp1, XMMRegister vtmp2) {
1630   switch (vlen) {
1631     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1632     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1633     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1634     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1635 
1636     default: assert(false, "wrong vector length");
1637   }
1638 }
1639 
1640 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1641                              Register dst, Register src1, XMMRegister src2,
1642                              XMMRegister vtmp1, XMMRegister vtmp2) {
1643   switch (vlen) {
1644     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1645     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1646     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1647     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1648 
1649     default: assert(false, "wrong vector length");
1650   }
1651 }
1652 
1653 #ifdef _LP64
1654 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1655                              Register dst, Register src1, XMMRegister src2,
1656                              XMMRegister vtmp1, XMMRegister vtmp2) {
1657   switch (vlen) {
1658     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1659     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1660     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1661 
1662     default: assert(false, "wrong vector length");
1663   }
1664 }
1665 #endif // _LP64
1666 
1667 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1668   switch (vlen) {
1669     case 2:
1670       assert(vtmp2 == xnoreg, "");
1671       reduce2F(opcode, dst, src, vtmp1);
1672       break;
1673     case 4:
1674       assert(vtmp2 == xnoreg, "");
1675       reduce4F(opcode, dst, src, vtmp1);
1676       break;
1677     case 8:
1678       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1679       break;
1680     case 16:
1681       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1682       break;
1683     default: assert(false, "wrong vector length");
1684   }
1685 }
1686 
1687 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1688   switch (vlen) {
1689     case 2:
1690       assert(vtmp2 == xnoreg, "");
1691       reduce2D(opcode, dst, src, vtmp1);
1692       break;
1693     case 4:
1694       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1695       break;
1696     case 8:
1697       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1698       break;
1699     default: assert(false, "wrong vector length");
1700   }
1701 }
1702 
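// The reduce helpers below fold the vector in halves: add reductions use the
// horizontal-add instructions (phaddd/phaddw/vphaddd/vphaddw), other opcodes
// shuffle or extract the upper part down and combine it via
// reduce_operation_128/256; the scalar value in src1 is folded in at the end.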
1703 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1704   if (opcode == Op_AddReductionVI) {
1705     if (vtmp1 != src2) {
1706       movdqu(vtmp1, src2);
1707     }
1708     phaddd(vtmp1, vtmp1);
1709   } else {
1710     pshufd(vtmp1, src2, 0x1);
1711     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1712   }
1713   movdl(vtmp2, src1);
1714   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1715   movdl(dst, vtmp1);
1716 }
1717 
1718 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1719   if (opcode == Op_AddReductionVI) {
1720     if (vtmp1 != src2) {
1721       movdqu(vtmp1, src2);
1722     }
1723     phaddd(vtmp1, src2);
1724     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1725   } else {
1726     pshufd(vtmp2, src2, 0xE);
1727     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1728     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1729   }
1730 }
1731 
1732 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1733   if (opcode == Op_AddReductionVI) {
1734     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1735     vextracti128_high(vtmp2, vtmp1);
1736     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1737     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1738   } else {
1739     vextracti128_high(vtmp1, src2);
1740     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1741     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1742   }
1743 }
1744 
1745 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1746   vextracti64x4_high(vtmp2, src2);
1747   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1748   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1749 }
1750 
1751 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1752   pshufd(vtmp2, src2, 0x1);
1753   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1754   movdqu(vtmp1, vtmp2);
1755   psrldq(vtmp1, 2);
1756   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1757   movdqu(vtmp2, vtmp1);
1758   psrldq(vtmp2, 1);
1759   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1760   movdl(vtmp2, src1);
1761   pmovsxbd(vtmp1, vtmp1);
1762   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1763   pextrb(dst, vtmp1, 0x0);
1764   movsbl(dst, dst);
1765 }
1766 
1767 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1768   pshufd(vtmp1, src2, 0xE);
1769   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1770   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1771 }
1772 
1773 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1774   vextracti128_high(vtmp2, src2);
1775   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1776   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1777 }
1778 
1779 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1780   vextracti64x4_high(vtmp1, src2);
1781   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1782   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1783 }
1784 
1785 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1786   pmovsxbw(vtmp2, src2);
1787   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1788 }
1789 
1790 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1791   if (UseAVX > 1) {
1792     int vector_len = Assembler::AVX_256bit;
1793     vpmovsxbw(vtmp1, src2, vector_len);
1794     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1795   } else {
1796     pmovsxbw(vtmp2, src2);
1797     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1798     pshufd(vtmp2, src2, 0x1);
1799     pmovsxbw(vtmp2, src2);
1800     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1801   }
1802 }
1803 
1804 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1805   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1806     int vector_len = Assembler::AVX_512bit;
1807     vpmovsxbw(vtmp1, src2, vector_len);
1808     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1809   } else {
1810     assert(UseAVX >= 2,"Should not reach here.");
1811     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1812     vextracti128_high(vtmp2, src2);
1813     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1814   }
1815 }
1816 
1817 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1818   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1819   vextracti64x4_high(vtmp2, src2);
1820   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1821 }
1822 
1823 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1824   if (opcode == Op_AddReductionVI) {
1825     if (vtmp1 != src2) {
1826       movdqu(vtmp1, src2);
1827     }
1828     phaddw(vtmp1, vtmp1);
1829     phaddw(vtmp1, vtmp1);
1830   } else {
1831     pshufd(vtmp2, src2, 0x1);
1832     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1833     movdqu(vtmp1, vtmp2);
1834     psrldq(vtmp1, 2);
1835     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1836   }
1837   movdl(vtmp2, src1);
1838   pmovsxwd(vtmp1, vtmp1);
1839   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1840   pextrw(dst, vtmp1, 0x0);
1841   movswl(dst, dst);
1842 }
1843 
1844 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1845   if (opcode == Op_AddReductionVI) {
1846     if (vtmp1 != src2) {
1847       movdqu(vtmp1, src2);
1848     }
1849     phaddw(vtmp1, src2);
1850   } else {
1851     pshufd(vtmp1, src2, 0xE);
1852     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1853   }
1854   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1855 }
1856 
1857 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1858   if (opcode == Op_AddReductionVI) {
1859     int vector_len = Assembler::AVX_256bit;
1860     vphaddw(vtmp2, src2, src2, vector_len);
1861     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1862   } else {
1863     vextracti128_high(vtmp2, src2);
1864     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1865   }
1866   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1867 }
1868 
1869 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1870   int vector_len = Assembler::AVX_256bit;
1871   vextracti64x4_high(vtmp1, src2);
1872   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1873   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1874 }
1875 
1876 #ifdef _LP64
1877 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1878   pshufd(vtmp2, src2, 0xE);
1879   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1880   movdq(vtmp1, src1);
1881   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1882   movdq(dst, vtmp1);
1883 }
1884 
1885 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1886   vextracti128_high(vtmp1, src2);
1887   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1888   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1889 }
1890 
1891 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1892   vextracti64x4_high(vtmp2, src2);
1893   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1894   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1895 }
1896 
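// Build a k-register mask with the low 'len' bits set: BZHI clears every bit of
// the all-ones value at position >= len, and the result is moved into dst.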
1897 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1898   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1899   mov64(temp, -1L);
1900   bzhiq(temp, temp, len);
1901   kmovql(dst, temp);
1902 }
1903 #endif // _LP64
1904 
1905 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1906   reduce_operation_128(T_FLOAT, opcode, dst, src);
1907   pshufd(vtmp, src, 0x1);
1908   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1909 }
1910 
1911 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1912   reduce2F(opcode, dst, src, vtmp);
1913   pshufd(vtmp, src, 0x2);
1914   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1915   pshufd(vtmp, src, 0x3);
1916   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1917 }
1918 
1919 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1920   reduce4F(opcode, dst, src, vtmp2);
1921   vextractf128_high(vtmp2, src);
1922   reduce4F(opcode, dst, vtmp2, vtmp1);
1923 }
1924 
1925 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1926   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1927   vextracti64x4_high(vtmp1, src);
1928   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1929 }
1930 
1931 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1932   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1933   pshufd(vtmp, src, 0xE);
1934   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1935 }
1936 
1937 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1938   reduce2D(opcode, dst, src, vtmp2);
1939   vextractf128_high(vtmp2, src);
1940   reduce2D(opcode, dst, vtmp2, vtmp1);
1941 }
1942 
1943 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1944   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1945   vextracti64x4_high(vtmp1, src);
1946   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1947 }
1948 
1949 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1950   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1951 }
1952 
1953 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1954   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1955 }
1956 
1957 
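// Min/max reductions for floats/doubles: each iteration folds the upper half of
// the working vector into the lower half (vextract* for the 512/256-bit halves,
// vpermilps/vpermilpd within a lane) and combines the pair with vminmax_fp
// until a single element remains.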
1958 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1959                                           XMMRegister dst, XMMRegister src,
1960                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1961                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1962   int permconst[] = {1, 14};
1963   XMMRegister wsrc = src;
1964   XMMRegister wdst = xmm_0;
1965   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1966 
1967   int vlen_enc = Assembler::AVX_128bit;
1968   if (vlen == 16) {
1969     vlen_enc = Assembler::AVX_256bit;
1970   }
1971 
1972   for (int i = log2(vlen) - 1; i >=0; i--) {
1973     if (i == 0 && !is_dst_valid) {
1974       wdst = dst;
1975     }
1976     if (i == 3) {
1977       vextracti64x4_high(wtmp, wsrc);
1978     } else if (i == 2) {
1979       vextracti128_high(wtmp, wsrc);
1980     } else { // i = [0,1]
1981       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1982     }
1983     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1984     wsrc = wdst;
1985     vlen_enc = Assembler::AVX_128bit;
1986   }
1987   if (is_dst_valid) {
1988     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1989   }
1990 }
1991 
1992 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
1993                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1994                                         XMMRegister xmm_0, XMMRegister xmm_1) {
1995   XMMRegister wsrc = src;
1996   XMMRegister wdst = xmm_0;
1997   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1998   int vlen_enc = Assembler::AVX_128bit;
1999   if (vlen == 8) {
2000     vlen_enc = Assembler::AVX_256bit;
2001   }
2002   for (int i = log2(vlen) - 1; i >=0; i--) {
2003     if (i == 0 && !is_dst_valid) {
2004       wdst = dst;
2005     }
2006     if (i == 1) {
2007       vextracti128_high(wtmp, wsrc);
2008     } else if (i == 2) {
2009       vextracti64x4_high(wtmp, wsrc);
2010     } else {
2011       assert(i == 0, "%d", i);
2012       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2013     }
2014     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2015     wsrc = wdst;
2016     vlen_enc = Assembler::AVX_128bit;
2017   }
2018   if (is_dst_valid) {
2019     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2020   }
2021 }
2022 
2023 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2024   switch (bt) {
2025     case T_BYTE:  pextrb(dst, src, idx); break;
2026     case T_SHORT: pextrw(dst, src, idx); break;
2027     case T_INT:   pextrd(dst, src, idx); break;
2028     case T_LONG:  pextrq(dst, src, idx); break;
2029 
2030     default:
2031       assert(false,"Should not reach here.");
2032       break;
2033   }
2034 }
2035 
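// Returns the register holding the 128-bit lane that contains 'elemindex':
// src itself for lane 0, otherwise the selected lane is extracted into dst.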
2036 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2037   int esize =  type2aelembytes(typ);
2038   int elem_per_lane = 16/esize;
2039   int lane = elemindex / elem_per_lane;
2040   int eindex = elemindex % elem_per_lane;
2041 
2042   if (lane >= 2) {
2043     assert(UseAVX > 2, "required");
2044     vextractf32x4(dst, src, lane & 3);
2045     return dst;
2046   } else if (lane > 0) {
2047     assert(UseAVX > 0, "required");
2048     vextractf128(dst, src, lane);
2049     return dst;
2050   } else {
2051     return src;
2052   }
2053 }
2054 
2055 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2056   int esize =  type2aelembytes(typ);
2057   int elem_per_lane = 16/esize;
2058   int eindex = elemindex % elem_per_lane;
2059   assert(is_integral_type(typ),"required");
2060 
2061   if (eindex == 0) {
2062     if (typ == T_LONG) {
2063       movq(dst, src);
2064     } else {
2065       movdl(dst, src);
2066       if (typ == T_BYTE)
2067         movsbl(dst, dst);
2068       else if (typ == T_SHORT)
2069         movswl(dst, dst);
2070     }
2071   } else {
2072     extract(typ, dst, src, eindex);
2073   }
2074 }
2075 
2076 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2077   int esize =  type2aelembytes(typ);
2078   int elem_per_lane = 16/esize;
2079   int eindex = elemindex % elem_per_lane;
2080   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2081 
2082   if (eindex == 0) {
2083     movq(dst, src);
2084   } else {
2085     if (typ == T_FLOAT) {
2086       if (UseAVX == 0) {
2087         movdqu(dst, src);
2088         pshufps(dst, dst, eindex);
2089       } else {
2090         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2091       }
2092     } else {
2093       if (UseAVX == 0) {
2094         movdqu(dst, src);
2095         psrldq(dst, eindex*esize);
2096       } else {
2097         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2098       }
2099       movq(dst, dst);
2100     }
2101   }
2102   // Zero upper bits
2103   if (typ == T_FLOAT) {
2104     if (UseAVX == 0) {
2105       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2106       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2107       pand(dst, vtmp);
2108     } else {
2109       assert((tmp != noreg), "required.");
2110       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2111     }
2112   }
2113 }
2114 
2115 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2116   switch(typ) {
2117     case T_BYTE:
2118     case T_BOOLEAN:
2119       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2120       break;
2121     case T_SHORT:
2122     case T_CHAR:
2123       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2124       break;
2125     case T_INT:
2126     case T_FLOAT:
2127       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2128       break;
2129     case T_LONG:
2130     case T_DOUBLE:
2131       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2132       break;
2133     default:
2134       assert(false,"Should not reach here.");
2135       break;
2136   }
2137 }
2138 
2139 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2140   switch(typ) {
2141     case T_BOOLEAN:
2142     case T_BYTE:
2143       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2144       break;
2145     case T_CHAR:
2146     case T_SHORT:
2147       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2148       break;
2149     case T_INT:
2150     case T_FLOAT:
2151       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2152       break;
2153     case T_LONG:
2154     case T_DOUBLE:
2155       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2156       break;
2157     default:
2158       assert(false,"Should not reach here.");
2159       break;
2160   }
2161 }
2162 
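// Unsigned element compare: both operands are zero-extended to the next wider
// element type, compared there (zero-extended values order correctly under the
// wider compare), and the result is packed back to the original element width.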
2163 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2164                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2165   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2166   switch (typ) {
2167   case T_BYTE:
2168     vpmovzxbw(vtmp1, src1, vlen_enc);
2169     vpmovzxbw(vtmp2, src2, vlen_enc);
2170     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2171     vpacksswb(dst, dst, dst, vlen_enc);
2172     break;
2173   case T_SHORT:
2174     vpmovzxwd(vtmp1, src1, vlen_enc);
2175     vpmovzxwd(vtmp2, src2, vlen_enc);
2176     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2177     vpackssdw(dst, dst, dst, vlen_enc);
2178     break;
2179   case T_INT:
2180     vpmovzxdq(vtmp1, src1, vlen_enc);
2181     vpmovzxdq(vtmp2, src2, vlen_enc);
2182     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2183     vpermilps(dst, dst, 8, vlen_enc);
2184     break;
2185   default:
2186     assert(false, "Should not reach here");
2187   }
2188   if (vlen_in_bytes == 16) {
2189     vpermpd(dst, dst, 0x8, vlen_enc);
2190   }
2191 }
2192 
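// Same widening trick as vpcmpu above, applied to 32-byte vectors: the low and
// high 128-bit halves are widened and compared separately, then the two partial
// results are packed/blended back into the original element order.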
2193 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2194                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2195   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2196   switch (typ) {
2197   case T_BYTE:
2198     vpmovzxbw(vtmp1, src1, vlen_enc);
2199     vpmovzxbw(vtmp2, src2, vlen_enc);
2200     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2201     vextracti128(vtmp1, src1, 1);
2202     vextracti128(vtmp2, src2, 1);
2203     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2204     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2205     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2206     vpacksswb(dst, dst, vtmp3, vlen_enc);
2207     vpermpd(dst, dst, 0xd8, vlen_enc);
2208     break;
2209   case T_SHORT:
2210     vpmovzxwd(vtmp1, src1, vlen_enc);
2211     vpmovzxwd(vtmp2, src2, vlen_enc);
2212     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2213     vextracti128(vtmp1, src1, 1);
2214     vextracti128(vtmp2, src2, 1);
2215     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2216     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2217     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2218     vpackssdw(dst, dst, vtmp3, vlen_enc);
2219     vpermpd(dst, dst, 0xd8, vlen_enc);
2220     break;
2221   case T_INT:
2222     vpmovzxdq(vtmp1, src1, vlen_enc);
2223     vpmovzxdq(vtmp2, src2, vlen_enc);
2224     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2225     vpshufd(dst, dst, 8, vlen_enc);
2226     vpermq(dst, dst, 8, vlen_enc);
2227     vextracti128(vtmp1, src1, 1);
2228     vextracti128(vtmp2, src2, 1);
2229     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2230     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2231     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2232     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2233     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2234     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2235     break;
2236   default:
2237     assert(false, "Should not reach here");
2238   }
2239 }
2240 
2241 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2242   switch(typ) {
2243     case T_BYTE:
2244       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2245       break;
2246     case T_SHORT:
2247       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2248       break;
2249     case T_INT:
2250     case T_FLOAT:
2251       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2252       break;
2253     case T_LONG:
2254     case T_DOUBLE:
2255       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2256       break;
2257     default:
2258       assert(false,"Should not reach here.");
2259       break;
2260   }
2261 }
2262 
2263 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2264                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2265   switch(vlen) {
2266     case 4:
2267       assert(vtmp1 != xnoreg, "required.");
2268       // Broadcast lower 32 bits to 128 bits before ptest
2269       pshufd(vtmp1, src1, 0x0);
2270       if (bt == BoolTest::overflow) {
2271         assert(vtmp2 != xnoreg, "required.");
2272         pshufd(vtmp2, src2, 0x0);
2273       } else {
2274         assert(vtmp2 == xnoreg, "required.");
2275         vtmp2 = src2;
2276       }
2277       ptest(vtmp1, vtmp2);
2278      break;
2279     case 8:
2280       assert(vtmp1 != xnoreg, "required.");
2281       // Broadcast lower 64 bits to 128 bits before ptest
2282       pshufd(vtmp1, src1, 0x4);
2283       if (bt == BoolTest::overflow) {
2284         assert(vtmp2 != xnoreg, "required.");
2285         pshufd(vtmp2, src2, 0x4);
2286       } else {
2287         assert(vtmp2 == xnoreg, "required.");
2288         vtmp2 = src2;
2289       }
2290       ptest(vtmp1, vtmp2);
2291      break;
2292     case 16:
2293       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2294       ptest(src1, src2);
2295       break;
2296     case 32:
2297       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2298       vptest(src1, src2, Assembler::AVX_256bit);
2299       break;
2300     case 64:
2301       {
2302         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2303         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2304         if (bt == BoolTest::ne) {
2305           ktestql(mask, mask);
2306         } else {
2307           assert(bt == BoolTest::overflow, "required");
2308           kortestql(mask, mask);
2309         }
2310       }
2311       break;
2312     default:
2313       assert(false,"Should not reach here.");
2314       break;
2315   }
2316 }
2317 
2318 //-------------------------------------------------------------------------------------------
2319 
2320 // IndexOf for constant substrings with size >= 8 chars
2321 // which don't need to be loaded through stack.
2322 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2323                                          Register cnt1, Register cnt2,
2324                                          int int_cnt2,  Register result,
2325                                          XMMRegister vec, Register tmp,
2326                                          int ae) {
2327   ShortBranchVerifier sbv(this);
2328   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2329   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2330 
2331   // This method uses the pcmpestri instruction with bound registers
2332   //   inputs:
2333   //     xmm - substring
2334   //     rax - substring length (elements count)
2335   //     mem - scanned string
2336   //     rdx - string length (elements count)
2337   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2338   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2339   //   outputs:
2340   //     rcx - matched index in string
2341   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2342   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2343   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2344   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2345   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2346 
2347   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2348         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2349         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2350 
2351   // Note, inline_string_indexOf() generates checks:
2352   // if (substr.count > string.count) return -1;
2353   // if (substr.count == 0) return 0;
2354   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2355 
2356   // Load substring.
2357   if (ae == StrIntrinsicNode::UL) {
2358     pmovzxbw(vec, Address(str2, 0));
2359   } else {
2360     movdqu(vec, Address(str2, 0));
2361   }
2362   movl(cnt2, int_cnt2);
2363   movptr(result, str1); // string addr
2364 
2365   if (int_cnt2 > stride) {
2366     jmpb(SCAN_TO_SUBSTR);
2367 
2368     // Reload substr for rescan; this code
2369     // is executed only for large substrings (> 8 chars)
2370     bind(RELOAD_SUBSTR);
2371     if (ae == StrIntrinsicNode::UL) {
2372       pmovzxbw(vec, Address(str2, 0));
2373     } else {
2374       movdqu(vec, Address(str2, 0));
2375     }
2376     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2377 
2378     bind(RELOAD_STR);
2379     // We came here after the beginning of the substring was
2380     // matched but the rest of it was not, so we need to search
2381     // again. Start from the next element after the previous match.
2382 
2383     // cnt2 is the number of remaining substring elements and
2384     // cnt1 is the number of remaining string elements when the compare failed.
2385     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2386     subl(cnt1, cnt2);
2387     addl(cnt1, int_cnt2);
2388     movl(cnt2, int_cnt2); // Now restore cnt2
2389 
2390     decrementl(cnt1);     // Shift to next element
2391     cmpl(cnt1, cnt2);
2392     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2393 
2394     addptr(result, (1<<scale1));
2395 
2396   } // (int_cnt2 > 8)
2397 
2398   // Scan string for start of substr in 16-byte vectors
2399   bind(SCAN_TO_SUBSTR);
2400   pcmpestri(vec, Address(result, 0), mode);
2401   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2402   subl(cnt1, stride);
2403   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2404   cmpl(cnt1, cnt2);
2405   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2406   addptr(result, 16);
2407   jmpb(SCAN_TO_SUBSTR);
2408 
2409   // Found a potential substr
2410   bind(FOUND_CANDIDATE);
2411   // Matched whole vector if first element matched (tmp(rcx) == 0).
2412   if (int_cnt2 == stride) {
2413     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2414   } else { // int_cnt2 > 8
2415     jccb(Assembler::overflow, FOUND_SUBSTR);
2416   }
2417   // After pcmpestri tmp(rcx) contains matched element index
2418   // Compute start addr of substr
2419   lea(result, Address(result, tmp, scale1));
2420 
2421   // Make sure string is still long enough
2422   subl(cnt1, tmp);
2423   cmpl(cnt1, cnt2);
2424   if (int_cnt2 == stride) {
2425     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2426   } else { // int_cnt2 > 8
2427     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2428   }
2429   // Left less than substring.
2430 
2431   bind(RET_NOT_FOUND);
2432   movl(result, -1);
2433   jmp(EXIT);
2434 
2435   if (int_cnt2 > stride) {
2436     // This code is optimized for the case when whole substring
2437     // is matched if its head is matched.
2438     bind(MATCH_SUBSTR_HEAD);
2439     pcmpestri(vec, Address(result, 0), mode);
2440     // Reload only the string if it does not match
2441     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2442 
2443     Label CONT_SCAN_SUBSTR;
2444     // Compare the rest of substring (> 8 chars).
2445     bind(FOUND_SUBSTR);
2446     // First 8 chars are already matched.
2447     negptr(cnt2);
2448     addptr(cnt2, stride);
2449 
2450     bind(SCAN_SUBSTR);
2451     subl(cnt1, stride);
2452     cmpl(cnt2, -stride); // Do not read beyond substring
2453     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2454     // Back-up strings to avoid reading beyond substring:
2455     // cnt1 = cnt1 - cnt2 + 8
2456     addl(cnt1, cnt2); // cnt2 is negative
2457     addl(cnt1, stride);
2458     movl(cnt2, stride); negptr(cnt2);
2459     bind(CONT_SCAN_SUBSTR);
2460     if (int_cnt2 < (int)G) {
2461       int tail_off1 = int_cnt2<<scale1;
2462       int tail_off2 = int_cnt2<<scale2;
2463       if (ae == StrIntrinsicNode::UL) {
2464         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2465       } else {
2466         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2467       }
2468       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2469     } else {
2470       // calculate index in register to avoid integer overflow (int_cnt2*2)
2471       movl(tmp, int_cnt2);
2472       addptr(tmp, cnt2);
2473       if (ae == StrIntrinsicNode::UL) {
2474         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2475       } else {
2476         movdqu(vec, Address(str2, tmp, scale2, 0));
2477       }
2478       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2479     }
2480     // Need to reload string pointers if we did not match the whole vector
2481     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2482     addptr(cnt2, stride);
2483     jcc(Assembler::negative, SCAN_SUBSTR);
2484     // Fall through if found full substring
2485 
2486   } // (int_cnt2 > 8)
2487 
2488   bind(RET_FOUND);
2489   // Found result if we matched full small substring.
2490   // Compute substr offset
2491   subptr(result, str1);
2492   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2493     shrl(result, 1); // index
2494   }
2495   bind(EXIT);
2496 
2497 } // string_indexofC8
2498 
2499 // Small strings are loaded through stack if they cross page boundary.
2500 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2501                                        Register cnt1, Register cnt2,
2502                                        int int_cnt2,  Register result,
2503                                        XMMRegister vec, Register tmp,
2504                                        int ae) {
2505   ShortBranchVerifier sbv(this);
2506   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2507   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2508 
2509   //
2510   // int_cnt2 is length of small (< 8 chars) constant substring
2511   // or (-1) for non constant substring in which case its length
2512   // is in cnt2 register.
2513   //
2514   // Note, inline_string_indexOf() generates checks:
2515   // if (substr.count > string.count) return -1;
2516   // if (substr.count == 0) return 0;
2517   //
2518   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2519   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2520   // This method uses the pcmpestri instruction with bound registers
2521   //   inputs:
2522   //     xmm - substring
2523   //     rax - substring length (elements count)
2524   //     mem - scanned string
2525   //     rdx - string length (elements count)
2526   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2527   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2528   //   outputs:
2529   //     rcx - matched index in string
2530   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2531   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2532   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2533   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2534 
2535   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2536         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2537         FOUND_CANDIDATE;
2538 
2539   { //========================================================
2540     // We don't know where these strings are located
2541     // and we can't read beyond them. Load them through stack.
2542     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2543 
2544     movptr(tmp, rsp); // save old SP
2545 
2546     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2547       if (int_cnt2 == (1>>scale2)) { // One byte
2548         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2549         load_unsigned_byte(result, Address(str2, 0));
2550         movdl(vec, result); // move 32 bits
2551       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2552         // Not enough header space in 32-bit VM: 12+3 = 15.
2553         movl(result, Address(str2, -1));
2554         shrl(result, 8);
2555         movdl(vec, result); // move 32 bits
2556       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2557         load_unsigned_short(result, Address(str2, 0));
2558         movdl(vec, result); // move 32 bits
2559       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2560         movdl(vec, Address(str2, 0)); // move 32 bits
2561       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2562         movq(vec, Address(str2, 0));  // move 64 bits
2563       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2564         // Array header size is 12 bytes in 32-bit VM
2565         // + 6 bytes for 3 chars == 18 bytes,
2566         // enough space to load vec and shift.
2567         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
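        // Load a full vector that ends at the last substring element (the load
        // starts inside the array header, which the assert above guarantees has
        // enough room) and shift the leading non-substring bytes out of vec.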
2568         if (ae == StrIntrinsicNode::UL) {
2569           int tail_off = int_cnt2-8;
2570           pmovzxbw(vec, Address(str2, tail_off));
2571           psrldq(vec, -2*tail_off);
2572         }
2573         else {
2574           int tail_off = int_cnt2*(1<<scale2);
2575           movdqu(vec, Address(str2, tail_off-16));
2576           psrldq(vec, 16-tail_off);
2577         }
2578       }
2579     } else { // not constant substring
2580       cmpl(cnt2, stride);
2581       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2582 
2583       // We can read beyond string if str+16 does not cross page boundary
2584       // since heaps are aligned and mapped by pages.
2585       assert(os::vm_page_size() < (int)G, "default page should be small");
2586       movl(result, str2); // We need only low 32 bits
2587       andl(result, (os::vm_page_size()-1));
2588       cmpl(result, (os::vm_page_size()-16));
2589       jccb(Assembler::belowEqual, CHECK_STR);
2590 
2591       // Move small strings to stack to allow loading 16 bytes into vec.
2592       subptr(rsp, 16);
2593       int stk_offset = wordSize-(1<<scale2);
2594       push(cnt2);
2595 
2596       bind(COPY_SUBSTR);
2597       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2598         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2599         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2600       } else if (ae == StrIntrinsicNode::UU) {
2601         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2602         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2603       }
2604       decrement(cnt2);
2605       jccb(Assembler::notZero, COPY_SUBSTR);
2606 
2607       pop(cnt2);
2608       movptr(str2, rsp);  // New substring address
2609     } // non constant
2610 
2611     bind(CHECK_STR);
2612     cmpl(cnt1, stride);
2613     jccb(Assembler::aboveEqual, BIG_STRINGS);
2614 
2615     // Check cross page boundary.
2616     movl(result, str1); // We need only low 32 bits
2617     andl(result, (os::vm_page_size()-1));
2618     cmpl(result, (os::vm_page_size()-16));
2619     jccb(Assembler::belowEqual, BIG_STRINGS);
2620 
2621     subptr(rsp, 16);
2622     int stk_offset = -(1<<scale1);
2623     if (int_cnt2 < 0) { // not constant
2624       push(cnt2);
2625       stk_offset += wordSize;
2626     }
2627     movl(cnt2, cnt1);
2628 
2629     bind(COPY_STR);
2630     if (ae == StrIntrinsicNode::LL) {
2631       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2632       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2633     } else {
2634       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2635       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2636     }
2637     decrement(cnt2);
2638     jccb(Assembler::notZero, COPY_STR);
2639 
2640     if (int_cnt2 < 0) { // not constant
2641       pop(cnt2);
2642     }
2643     movptr(str1, rsp);  // New string address
2644 
2645     bind(BIG_STRINGS);
2646     // Load substring.
2647     if (int_cnt2 < 0) { // -1
2648       if (ae == StrIntrinsicNode::UL) {
2649         pmovzxbw(vec, Address(str2, 0));
2650       } else {
2651         movdqu(vec, Address(str2, 0));
2652       }
2653       push(cnt2);       // substr count
2654       push(str2);       // substr addr
2655       push(str1);       // string addr
2656     } else {
2657       // Small (< 8 chars) constant substrings are loaded already.
2658       movl(cnt2, int_cnt2);
2659     }
2660     push(tmp);  // original SP
2661 
2662   } // Finished loading
2663 
2664   //========================================================
2665   // Start search
2666   //
2667 
2668   movptr(result, str1); // string addr
2669 
2670   if (int_cnt2  < 0) {  // Only for non constant substring
2671     jmpb(SCAN_TO_SUBSTR);
2672 
2673     // SP saved at sp+0
2674     // String saved at sp+1*wordSize
2675     // Substr saved at sp+2*wordSize
2676     // Substr count saved at sp+3*wordSize
2677 
2678     // Reload substr for rescan; this code is
2679     // executed only for large substrings (> 8 chars).
2680     bind(RELOAD_SUBSTR);
2681     movptr(str2, Address(rsp, 2*wordSize));
2682     movl(cnt2, Address(rsp, 3*wordSize));
2683     if (ae == StrIntrinsicNode::UL) {
2684       pmovzxbw(vec, Address(str2, 0));
2685     } else {
2686       movdqu(vec, Address(str2, 0));
2687     }
2688     // We came here after the beginning of the substring was
2689     // matched but the rest of it was not, so we need to search
2690     // again. Start from the next element after the previous match.
2691     subptr(str1, result); // Restore counter
2692     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2693       shrl(str1, 1);
2694     }
2695     addl(cnt1, str1);
2696     decrementl(cnt1);   // Shift to next element
2697     cmpl(cnt1, cnt2);
2698     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2699 
2700     addptr(result, (1<<scale1));
2701   } // non constant
2702 
2703   // Scan string for start of substr in 16-byte vectors
2704   bind(SCAN_TO_SUBSTR);
2705   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
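       // pcmpestri: vec holds the substring (element count in rax/cnt2) and is
       // compared against 16 bytes of the string at [result] (element count in
       // rdx/cnt1); CF is set when a candidate match exists and tmp/rcx receives
       // the index of the first matching element.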
2706   pcmpestri(vec, Address(result, 0), mode);
2707   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2708   subl(cnt1, stride);
2709   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2710   cmpl(cnt1, cnt2);
2711   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2712   addptr(result, 16);
2713 
2714   bind(ADJUST_STR);
2715   cmpl(cnt1, stride); // Do not read beyond string
2716   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2717   // Back up the string pointer to avoid reading beyond the string.
2718   lea(result, Address(result, cnt1, scale1, -16));
2719   movl(cnt1, stride);
2720   jmpb(SCAN_TO_SUBSTR);
2721 
2722   // Found a potential substr
2723   bind(FOUND_CANDIDATE);
2724   // After pcmpestri tmp(rcx) contains matched element index
2725 
2726   // Make sure string is still long enough
2727   subl(cnt1, tmp);
2728   cmpl(cnt1, cnt2);
2729   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2730   // Left less than substring.
2731 
2732   bind(RET_NOT_FOUND);
2733   movl(result, -1);
2734   jmp(CLEANUP);
2735 
2736   bind(FOUND_SUBSTR);
2737   // Compute start addr of substr
2738   lea(result, Address(result, tmp, scale1));
2739   if (int_cnt2 > 0) { // Constant substring
2740     // Repeat search for small substring (< 8 chars)
2741     // from new point without reloading substring.
2742     // Have to check that we don't read beyond string.
2743     cmpl(tmp, stride-int_cnt2);
2744     jccb(Assembler::greater, ADJUST_STR);
2745     // Fall through if matched whole substring.
2746   } else { // non constant
2747     assert(int_cnt2 == -1, "should be -1 (non-constant substring)");
2748 
2749     addl(tmp, cnt2);
2750     // Found result if we matched whole substring.
2751     cmpl(tmp, stride);
2752     jcc(Assembler::lessEqual, RET_FOUND);
2753 
2754     // Repeat search for small substring (<= 8 chars)
2755     // from new point 'str1' without reloading substring.
2756     cmpl(cnt2, stride);
2757     // Have to check that we don't read beyond string.
2758     jccb(Assembler::lessEqual, ADJUST_STR);
2759 
2760     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2761     // Compare the rest of substring (> 8 chars).
2762     movptr(str1, result);
2763 
2764     cmpl(tmp, cnt2);
2765     // First 8 chars are already matched.
2766     jccb(Assembler::equal, CHECK_NEXT);
2767 
2768     bind(SCAN_SUBSTR);
2769     pcmpestri(vec, Address(str1, 0), mode);
2770     // Need to reload the string pointers if we did not match the whole vector
2771     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2772 
2773     bind(CHECK_NEXT);
2774     subl(cnt2, stride);
2775     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2776     addptr(str1, 16);
2777     if (ae == StrIntrinsicNode::UL) {
2778       addptr(str2, 8);
2779     } else {
2780       addptr(str2, 16);
2781     }
2782     subl(cnt1, stride);
2783     cmpl(cnt2, stride); // Do not read beyond substring
2784     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2785     // Back up the string pointers to avoid reading beyond the substring.
2786 
2787     if (ae == StrIntrinsicNode::UL) {
2788       lea(str2, Address(str2, cnt2, scale2, -8));
2789       lea(str1, Address(str1, cnt2, scale1, -16));
2790     } else {
2791       lea(str2, Address(str2, cnt2, scale2, -16));
2792       lea(str1, Address(str1, cnt2, scale1, -16));
2793     }
2794     subl(cnt1, cnt2);
2795     movl(cnt2, stride);
2796     addl(cnt1, stride);
2797     bind(CONT_SCAN_SUBSTR);
2798     if (ae == StrIntrinsicNode::UL) {
2799       pmovzxbw(vec, Address(str2, 0));
2800     } else {
2801       movdqu(vec, Address(str2, 0));
2802     }
2803     jmp(SCAN_SUBSTR);
2804 
2805     bind(RET_FOUND_LONG);
2806     movptr(str1, Address(rsp, wordSize));
2807   } // non constant
2808 
2809   bind(RET_FOUND);
2810   // Compute substr offset
2811   subptr(result, str1);
2812   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2813     shrl(result, 1); // index
2814   }
2815   bind(CLEANUP);
2816   pop(rsp); // restore SP
2817 
2818 } // string_indexof
2819 
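     // Roughly equivalent scalar code (an illustrative sketch only, not taken
     // from the JDK sources):
     //   static int indexOfChar(char[] value, char ch, int cnt) {
     //     for (int i = 0; i < cnt; i++) {
     //       if (value[i] == ch) {
     //         return i;
     //       }
     //     }
     //     return -1;
     //   }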
2820 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2821                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2822   ShortBranchVerifier sbv(this);
2823   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2824 
2825   int stride = 8;
2826 
2827   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2828         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2829         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2830         FOUND_SEQ_CHAR, DONE_LABEL;
2831 
2832   movptr(result, str1);
2833   if (UseAVX >= 2) {
2834     cmpl(cnt1, stride);
2835     jcc(Assembler::less, SCAN_TO_CHAR);
2836     cmpl(cnt1, 2*stride);
2837     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2838     movdl(vec1, ch);
2839     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2840     vpxor(vec2, vec2);
2841     movl(tmp, cnt1);
2842     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2843     andl(cnt1,0x0000000F);  //tail count (in chars)
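         // For example, cnt1 == 37 chars: tmp == 32 chars are handled by the
         // 16-char loop below and cnt1 == 5 chars are left for the tail code.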
2844 
2845     bind(SCAN_TO_16_CHAR_LOOP);
2846     vmovdqu(vec3, Address(result, 0));
2847     vpcmpeqw(vec3, vec3, vec1, 1);
2848     vptest(vec2, vec3);
2849     jcc(Assembler::carryClear, FOUND_CHAR);
2850     addptr(result, 32);
2851     subl(tmp, 2*stride);
2852     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2853     jmp(SCAN_TO_8_CHAR);
2854     bind(SCAN_TO_8_CHAR_INIT);
2855     movdl(vec1, ch);
2856     pshuflw(vec1, vec1, 0x00);
2857     pshufd(vec1, vec1, 0);
2858     pxor(vec2, vec2);
2859   }
2860   bind(SCAN_TO_8_CHAR);
2861   cmpl(cnt1, stride);
2862   jcc(Assembler::less, SCAN_TO_CHAR);
2863   if (UseAVX < 2) {
2864     movdl(vec1, ch);
2865     pshuflw(vec1, vec1, 0x00);
2866     pshufd(vec1, vec1, 0);
2867     pxor(vec2, vec2);
2868   }
2869   movl(tmp, cnt1);
2870   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2871   andl(cnt1,0x00000007);  //tail count (in chars)
2872 
2873   bind(SCAN_TO_8_CHAR_LOOP);
2874   movdqu(vec3, Address(result, 0));
2875   pcmpeqw(vec3, vec1);
2876   ptest(vec2, vec3);
2877   jcc(Assembler::carryClear, FOUND_CHAR);
2878   addptr(result, 16);
2879   subl(tmp, stride);
2880   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2881   bind(SCAN_TO_CHAR);
2882   testl(cnt1, cnt1);
2883   jcc(Assembler::zero, RET_NOT_FOUND);
2884   bind(SCAN_TO_CHAR_LOOP);
2885   load_unsigned_short(tmp, Address(result, 0));
2886   cmpl(ch, tmp);
2887   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2888   addptr(result, 2);
2889   subl(cnt1, 1);
2890   jccb(Assembler::zero, RET_NOT_FOUND);
2891   jmp(SCAN_TO_CHAR_LOOP);
2892 
2893   bind(RET_NOT_FOUND);
2894   movl(result, -1);
2895   jmpb(DONE_LABEL);
2896 
2897   bind(FOUND_CHAR);
2898   if (UseAVX >= 2) {
2899     vpmovmskb(tmp, vec3);
2900   } else {
2901     pmovmskb(tmp, vec3);
2902   }
2903   bsfl(ch, tmp);
2904   addptr(result, ch);
2905 
2906   bind(FOUND_SEQ_CHAR);
2907   subptr(result, str1);
2908   shrl(result, 1);
2909 
2910   bind(DONE_LABEL);
2911 } // string_indexof_char
2912 
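     // Latin-1 (byte) variant of string_indexof_char: scans cnt1 bytes of str1 for
     // the byte value in ch and returns the byte index or -1; the AVX2 loop compares
     // 32 bytes per iteration, the SSE loop 16 bytes.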
2913 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2914                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2915   ShortBranchVerifier sbv(this);
2916   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2917 
2918   int stride = 16;
2919 
2920   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2921         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2922         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2923         FOUND_SEQ_CHAR, DONE_LABEL;
2924 
2925   movptr(result, str1);
2926   if (UseAVX >= 2) {
2927     cmpl(cnt1, stride);
2928     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2929     cmpl(cnt1, stride*2);
2930     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2931     movdl(vec1, ch);
2932     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2933     vpxor(vec2, vec2);
2934     movl(tmp, cnt1);
2935     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2936     andl(cnt1,0x0000001F);  //tail count (in chars)
2937 
2938     bind(SCAN_TO_32_CHAR_LOOP);
2939     vmovdqu(vec3, Address(result, 0));
2940     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2941     vptest(vec2, vec3);
2942     jcc(Assembler::carryClear, FOUND_CHAR);
2943     addptr(result, 32);
2944     subl(tmp, stride*2);
2945     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2946     jmp(SCAN_TO_16_CHAR);
2947 
2948     bind(SCAN_TO_16_CHAR_INIT);
2949     movdl(vec1, ch);
2950     pxor(vec2, vec2);
2951     pshufb(vec1, vec2);
2952   }
2953 
2954   bind(SCAN_TO_16_CHAR);
2955   cmpl(cnt1, stride);
2956   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2957   if (UseAVX < 2) {
2958     movdl(vec1, ch);
2959     pxor(vec2, vec2);
2960     pshufb(vec1, vec2);
2961   }
2962   movl(tmp, cnt1);
2963   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
2964   andl(cnt1,0x0000000F);  //tail count (in bytes)
2965 
2966   bind(SCAN_TO_16_CHAR_LOOP);
2967   movdqu(vec3, Address(result, 0));
2968   pcmpeqb(vec3, vec1);
2969   ptest(vec2, vec3);
2970   jcc(Assembler::carryClear, FOUND_CHAR);
2971   addptr(result, 16);
2972   subl(tmp, stride);
2973   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
2974 
2975   bind(SCAN_TO_CHAR_INIT);
2976   testl(cnt1, cnt1);
2977   jcc(Assembler::zero, RET_NOT_FOUND);
2978   bind(SCAN_TO_CHAR_LOOP);
2979   load_unsigned_byte(tmp, Address(result, 0));
2980   cmpl(ch, tmp);
2981   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2982   addptr(result, 1);
2983   subl(cnt1, 1);
2984   jccb(Assembler::zero, RET_NOT_FOUND);
2985   jmp(SCAN_TO_CHAR_LOOP);
2986 
2987   bind(RET_NOT_FOUND);
2988   movl(result, -1);
2989   jmpb(DONE_LABEL);
2990 
2991   bind(FOUND_CHAR);
2992   if (UseAVX >= 2) {
2993     vpmovmskb(tmp, vec3);
2994   } else {
2995     pmovmskb(tmp, vec3);
2996   }
2997   bsfl(ch, tmp);
2998   addptr(result, ch);
2999 
3000   bind(FOUND_SEQ_CHAR);
3001   subptr(result, str1);
3002 
3003   bind(DONE_LABEL);
3004 } // stringL_indexof_char
3005 
3006 // helper function for string_compare
3007 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3008                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3009                                            Address::ScaleFactor scale2, Register index, int ae) {
3010   if (ae == StrIntrinsicNode::LL) {
3011     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3012     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3013   } else if (ae == StrIntrinsicNode::UU) {
3014     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3015     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3016   } else {
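         // LU/UL: the Latin-1 string is expected in str1 and the UTF-16 string in
         // str2; for UL the operands arrive swapped, which is why string_compare
         // negates the final result at DONE_LABEL.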
3017     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3018     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3019   }
3020 }
3021 
3022 // Compare strings, used for char[] and byte[].
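     // Returns a negative value, zero or a positive value in result, mirroring the
     // usual compareTo contract. Roughly equivalent scalar code (an illustrative
     // sketch; the ae parameter selects the byte/char encodings):
     //   static int compare(char[] s1, int len1, char[] s2, int len2) {
     //     int min = Math.min(len1, len2);
     //     for (int i = 0; i < min; i++) {
     //       if (s1[i] != s2[i]) {
     //         return s1[i] - s2[i];
     //       }
     //     }
     //     return len1 - len2;
     //   }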
3023 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3024                                        Register cnt1, Register cnt2, Register result,
3025                                        XMMRegister vec1, int ae, KRegister mask) {
3026   ShortBranchVerifier sbv(this);
3027   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3028   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3029   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3030   int stride2x2 = 0x40;
3031   Address::ScaleFactor scale = Address::no_scale;
3032   Address::ScaleFactor scale1 = Address::no_scale;
3033   Address::ScaleFactor scale2 = Address::no_scale;
3034 
3035   if (ae != StrIntrinsicNode::LL) {
3036     stride2x2 = 0x20;
3037   }
3038 
3039   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3040     shrl(cnt2, 1);
3041   }
3042   // Compute the minimum of the string lengths and the
3043   // difference of the string lengths (stack).
3044   // Use a conditional move to compute the minimum.
3045   movl(result, cnt1);
3046   subl(cnt1, cnt2);
3047   push(cnt1);
3048   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
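       // For example cnt1 == 5, cnt2 == 9: the length difference 5 - 9 == -4 is
       // pushed and cnt2 becomes min(5, 9) == 5.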
3049 
3050   // Is the minimum length zero?
3051   testl(cnt2, cnt2);
3052   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3053   if (ae == StrIntrinsicNode::LL) {
3054     // Load first bytes
3055     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3056     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3057   } else if (ae == StrIntrinsicNode::UU) {
3058     // Load first characters
3059     load_unsigned_short(result, Address(str1, 0));
3060     load_unsigned_short(cnt1, Address(str2, 0));
3061   } else {
3062     load_unsigned_byte(result, Address(str1, 0));
3063     load_unsigned_short(cnt1, Address(str2, 0));
3064   }
3065   subl(result, cnt1);
3066   jcc(Assembler::notZero,  POP_LABEL);
3067 
3068   if (ae == StrIntrinsicNode::UU) {
3069     // Divide length by 2 to get number of chars
3070     shrl(cnt2, 1);
3071   }
3072   cmpl(cnt2, 1);
3073   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3074 
3075   // Check if the strings start at the same location and setup scale and stride
3076   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3077     cmpptr(str1, str2);
3078     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3079     if (ae == StrIntrinsicNode::LL) {
3080       scale = Address::times_1;
3081       stride = 16;
3082     } else {
3083       scale = Address::times_2;
3084       stride = 8;
3085     }
3086   } else {
3087     scale1 = Address::times_1;
3088     scale2 = Address::times_2;
3089     // scale not used
3090     stride = 8;
3091   }
3092 
3093   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3094     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3095     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3096     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3097     Label COMPARE_TAIL_LONG;
3098     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3099 
3100     int pcmpmask = 0x19;
3101     if (ae == StrIntrinsicNode::LL) {
3102       pcmpmask &= ~0x01;
3103     }
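         // pcmpmask selects the "string compare with negated result" mode (11000)
         // plus 01 for unsigned shorts, or 00 for unsigned bytes once the low bit is
         // cleared for LL (see the pcmpestri comment in the SSE4.2 block below).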
3104 
3105     // Set up to compare 16-char (32-byte) vectors,
3106     // starting from the first character again because it has an aligned address.
3107     if (ae == StrIntrinsicNode::LL) {
3108       stride2 = 32;
3109     } else {
3110       stride2 = 16;
3111     }
3112     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3113       adr_stride = stride << scale;
3114     } else {
3115       adr_stride1 = 8;  //stride << scale1;
3116       adr_stride2 = 16; //stride << scale2;
3117     }
3118 
3119     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3120     // rax and rdx are used by pcmpestri as element counters
3121     movl(result, cnt2);
3122     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3123     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3124 
3125     // Fast path: compare the first two 8-char vectors.
3126     bind(COMPARE_16_CHARS);
3127     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3128       movdqu(vec1, Address(str1, 0));
3129     } else {
3130       pmovzxbw(vec1, Address(str1, 0));
3131     }
3132     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3133     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3134 
3135     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3136       movdqu(vec1, Address(str1, adr_stride));
3137       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3138     } else {
3139       pmovzxbw(vec1, Address(str1, adr_stride1));
3140       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3141     }
3142     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3143     addl(cnt1, stride);
3144 
3145     // Compare the characters at index in cnt1
3146     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3147     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3148     subl(result, cnt2);
3149     jmp(POP_LABEL);
3150 
3151     // Setup the registers to start vector comparison loop
3152     bind(COMPARE_WIDE_VECTORS);
3153     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3154       lea(str1, Address(str1, result, scale));
3155       lea(str2, Address(str2, result, scale));
3156     } else {
3157       lea(str1, Address(str1, result, scale1));
3158       lea(str2, Address(str2, result, scale2));
3159     }
3160     subl(result, stride2);
3161     subl(cnt2, stride2);
3162     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3163     negptr(result);
3164 
3165     // In a loop, compare 16 chars (32 bytes) at once using vpxor+vptest
3166     bind(COMPARE_WIDE_VECTORS_LOOP);
3167 
3168 #ifdef _LP64
3169     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3170       cmpl(cnt2, stride2x2);
3171       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3172       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3173       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3174 
3175       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3176       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3177         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3178         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3179       } else {
3180         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3181         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3182       }
3183       kortestql(mask, mask);
3184       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3185       addptr(result, stride2x2);  // update since we already compared at this addr
3186       subl(cnt2, stride2x2);      // and sub the size too
3187       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3188 
3189       vpxor(vec1, vec1);
3190       jmpb(COMPARE_WIDE_TAIL);
3191     }//if (VM_Version::supports_avx512vlbw())
3192 #endif // _LP64
3193 
3194 
3195     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3196     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3197       vmovdqu(vec1, Address(str1, result, scale));
3198       vpxor(vec1, Address(str2, result, scale));
3199     } else {
3200       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3201       vpxor(vec1, Address(str2, result, scale2));
3202     }
3203     vptest(vec1, vec1);
3204     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3205     addptr(result, stride2);
3206     subl(cnt2, stride2);
3207     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3208     // clean upper bits of YMM registers
3209     vpxor(vec1, vec1);
3210 
3211     // compare wide vectors tail
3212     bind(COMPARE_WIDE_TAIL);
3213     testptr(result, result);
3214     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3215 
3216     movl(result, stride2);
3217     movl(cnt2, result);
3218     negptr(result);
3219     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3220 
3221     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3222     bind(VECTOR_NOT_EQUAL);
3223     // clean upper bits of YMM registers
3224     vpxor(vec1, vec1);
3225     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3226       lea(str1, Address(str1, result, scale));
3227       lea(str2, Address(str2, result, scale));
3228     } else {
3229       lea(str1, Address(str1, result, scale1));
3230       lea(str2, Address(str2, result, scale2));
3231     }
3232     jmp(COMPARE_16_CHARS);
3233 
3234     // Compare tail chars, length between 1 and 15 chars
3235     bind(COMPARE_TAIL_LONG);
3236     movl(cnt2, result);
3237     cmpl(cnt2, stride);
3238     jcc(Assembler::less, COMPARE_SMALL_STR);
3239 
3240     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3241       movdqu(vec1, Address(str1, 0));
3242     } else {
3243       pmovzxbw(vec1, Address(str1, 0));
3244     }
3245     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3246     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3247     subptr(cnt2, stride);
3248     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3249     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3250       lea(str1, Address(str1, result, scale));
3251       lea(str2, Address(str2, result, scale));
3252     } else {
3253       lea(str1, Address(str1, result, scale1));
3254       lea(str2, Address(str2, result, scale2));
3255     }
3256     negptr(cnt2);
3257     jmpb(WHILE_HEAD_LABEL);
3258 
3259     bind(COMPARE_SMALL_STR);
3260   } else if (UseSSE42Intrinsics) {
3261     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3262     int pcmpmask = 0x19;
3263     // Set up to compare 8-char (16-byte) vectors,
3264     // starting from the first character again because it has an aligned address.
3265     movl(result, cnt2);
3266     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3267     if (ae == StrIntrinsicNode::LL) {
3268       pcmpmask &= ~0x01;
3269     }
3270     jcc(Assembler::zero, COMPARE_TAIL);
3271     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3272       lea(str1, Address(str1, result, scale));
3273       lea(str2, Address(str2, result, scale));
3274     } else {
3275       lea(str1, Address(str1, result, scale1));
3276       lea(str2, Address(str2, result, scale2));
3277     }
3278     negptr(result);
3279 
3280     // pcmpestri
3281     //   inputs:
3282     //     vec1- substring
3283     //     rax - negative string length (elements count)
3284     //     mem - scanned string
3285     //     rdx - string length (elements count)
3286     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3287     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3288     //   outputs:
3289     //     rcx - first mismatched element index
3290     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3291 
3292     bind(COMPARE_WIDE_VECTORS);
3293     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3294       movdqu(vec1, Address(str1, result, scale));
3295       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3296     } else {
3297       pmovzxbw(vec1, Address(str1, result, scale1));
3298       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3299     }
3300     // After pcmpestri cnt1(rcx) contains mismatched element index
3301 
3302     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3303     addptr(result, stride);
3304     subptr(cnt2, stride);
3305     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3306 
3307     // compare wide vectors tail
3308     testptr(result, result);
3309     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3310 
3311     movl(cnt2, stride);
3312     movl(result, stride);
3313     negptr(result);
3314     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3315       movdqu(vec1, Address(str1, result, scale));
3316       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3317     } else {
3318       pmovzxbw(vec1, Address(str1, result, scale1));
3319       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3320     }
3321     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3322 
3323     // Mismatched characters in the vectors
3324     bind(VECTOR_NOT_EQUAL);
3325     addptr(cnt1, result);
3326     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3327     subl(result, cnt2);
3328     jmpb(POP_LABEL);
3329 
3330     bind(COMPARE_TAIL); // limit is zero
3331     movl(cnt2, result);
3332     // Fallthru to tail compare
3333   }
3334   // Shift str2 and str1 to the end of the arrays, negate min
3335   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3336     lea(str1, Address(str1, cnt2, scale));
3337     lea(str2, Address(str2, cnt2, scale));
3338   } else {
3339     lea(str1, Address(str1, cnt2, scale1));
3340     lea(str2, Address(str2, cnt2, scale2));
3341   }
3342   decrementl(cnt2);  // first character was compared already
3343   negptr(cnt2);
3344 
3345   // Compare the rest of the elements
3346   bind(WHILE_HEAD_LABEL);
3347   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3348   subl(result, cnt1);
3349   jccb(Assembler::notZero, POP_LABEL);
3350   increment(cnt2);
3351   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3352 
3353   // Strings are equal up to min length.  Return the length difference.
3354   bind(LENGTH_DIFF_LABEL);
3355   pop(result);
3356   if (ae == StrIntrinsicNode::UU) {
3357     // Divide diff by 2 to get number of chars
3358     sarl(result, 1);
3359   }
3360   jmpb(DONE_LABEL);
3361 
3362 #ifdef _LP64
3363   if (VM_Version::supports_avx512vlbw()) {
3364 
3365     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3366 
3367     kmovql(cnt1, mask);
3368     notq(cnt1);
3369     bsfq(cnt2, cnt1);
3370     if (ae != StrIntrinsicNode::LL) {
3371       // Divide diff by 2 to get number of chars
3372       sarl(cnt2, 1);
3373     }
3374     addq(result, cnt2);
3375     if (ae == StrIntrinsicNode::LL) {
3376       load_unsigned_byte(cnt1, Address(str2, result));
3377       load_unsigned_byte(result, Address(str1, result));
3378     } else if (ae == StrIntrinsicNode::UU) {
3379       load_unsigned_short(cnt1, Address(str2, result, scale));
3380       load_unsigned_short(result, Address(str1, result, scale));
3381     } else {
3382       load_unsigned_short(cnt1, Address(str2, result, scale2));
3383       load_unsigned_byte(result, Address(str1, result, scale1));
3384     }
3385     subl(result, cnt1);
3386     jmpb(POP_LABEL);
3387   }//if (VM_Version::supports_avx512vlbw())
3388 #endif // _LP64
3389 
3390   // Discard the stored length difference
3391   bind(POP_LABEL);
3392   pop(cnt1);
3393 
3394   // That's it
3395   bind(DONE_LABEL);
3396   if (ae == StrIntrinsicNode::UL) {
3397     negl(result);
3398   }
3399 
3400 }
3401 
3402 // Search for a non-ASCII character (negative byte value) in a byte array;
3403 // return true if it contains one and false otherwise.
3404 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3405 //   @IntrinsicCandidate
3406 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3407 //     for (int i = off; i < off + len; i++) {
3408 //       if (ba[i] < 0) {
3409 //         return true;
3410 //       }
3411 //     }
3412 //     return false;
3413 //   }
3414 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3415   Register result, Register tmp1,
3416   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3417   // rsi: byte array
3418   // rcx: len
3419   // rax: result
3420   ShortBranchVerifier sbv(this);
3421   assert_different_registers(ary1, len, result, tmp1);
3422   assert_different_registers(vec1, vec2);
3423   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3424 
3425   // len == 0
3426   testl(len, len);
3427   jcc(Assembler::zero, FALSE_LABEL);
3428 
3429   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3430     VM_Version::supports_avx512vlbw() &&
3431     VM_Version::supports_bmi2()) {
3432 
3433     Label test_64_loop, test_tail;
3434     Register tmp3_aliased = len;
3435 
3436     movl(tmp1, len);
3437     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3438 
3439     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3440     andl(len, ~(64 - 1));    // vector count (in chars)
3441     jccb(Assembler::zero, test_tail);
3442 
3443     lea(ary1, Address(ary1, len, Address::times_1));
3444     negptr(len);
3445 
3446     bind(test_64_loop);
3447     // Check whether our 64 elements of size byte contain negatives
3448     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3449     kortestql(mask1, mask1);
3450     jcc(Assembler::notZero, TRUE_LABEL);
3451 
3452     addptr(len, 64);
3453     jccb(Assembler::notZero, test_64_loop);
3454 
3455 
3456     bind(test_tail);
3457     // bail out when there is nothing to be done
3458     testl(tmp1, -1);
3459     jcc(Assembler::zero, FALSE_LABEL);
3460 
3461     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3462 #ifdef _LP64
3463     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3464     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3465     notq(tmp3_aliased);
3466     kmovql(mask2, tmp3_aliased);
3467 #else
3468     Label k_init;
3469     jmp(k_init);
3470 
3471     // We cannot read 64 bits from a general purpose register, thus we move the
3472     // data required to compose 64 1's into the instruction stream.
3473     // We emit a 64-byte wide series of elements from 0..63 which is later used
3474     // as the compare target against the tail count contained in the tmp1 register.
3475     // The result is a k register holding tmp1 consecutive 1's counted from the
3476     // least significant bit.
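         // For example, with tmp1 == 5 the broadcast value 5 compared (signed >)
         // against the 0..63 table emitted below sets exactly the low 5 bits of the
         // resulting k register.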
3477     address tmp = pc();
3478     emit_int64(0x0706050403020100);
3479     emit_int64(0x0F0E0D0C0B0A0908);
3480     emit_int64(0x1716151413121110);
3481     emit_int64(0x1F1E1D1C1B1A1918);
3482     emit_int64(0x2726252423222120);
3483     emit_int64(0x2F2E2D2C2B2A2928);
3484     emit_int64(0x3736353433323130);
3485     emit_int64(0x3F3E3D3C3B3A3938);
3486 
3487     bind(k_init);
3488     lea(len, InternalAddress(tmp));
3489     // create mask to test for negative byte inside a vector
3490     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3491     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3492 
3493 #endif
3494     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3495     ktestq(mask1, mask2);
3496     jcc(Assembler::notZero, TRUE_LABEL);
3497 
3498     jmp(FALSE_LABEL);
3499   } else {
3500     movl(result, len); // copy
3501 
3502     if (UseAVX >= 2 && UseSSE >= 2) {
3503       // With AVX2, use 32-byte vector compare
3504       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3505 
3506       // Compare 32-byte vectors
3507       andl(result, 0x0000001f);  //   tail count (in bytes)
3508       andl(len, 0xffffffe0);   // vector count (in bytes)
3509       jccb(Assembler::zero, COMPARE_TAIL);
3510 
3511       lea(ary1, Address(ary1, len, Address::times_1));
3512       negptr(len);
3513 
3514       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3515       movdl(vec2, tmp1);
3516       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
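           // In the loop below, vptest sets ZF only when (vec1 & vec2) == 0, i.e.
           // none of the 32 loaded bytes has its sign bit set; a notZero branch
           // therefore means a negative byte was found.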
3517 
3518       bind(COMPARE_WIDE_VECTORS);
3519       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3520       vptest(vec1, vec2);
3521       jccb(Assembler::notZero, TRUE_LABEL);
3522       addptr(len, 32);
3523       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3524 
3525       testl(result, result);
3526       jccb(Assembler::zero, FALSE_LABEL);
3527 
3528       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3529       vptest(vec1, vec2);
3530       jccb(Assembler::notZero, TRUE_LABEL);
3531       jmpb(FALSE_LABEL);
3532 
3533       bind(COMPARE_TAIL); // len is zero
3534       movl(len, result);
3535       // Fallthru to tail compare
3536     } else if (UseSSE42Intrinsics) {
3537       // With SSE4.2, use double quad vector compare
3538       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3539 
3540       // Compare 16-byte vectors
3541       andl(result, 0x0000000f);  //   tail count (in bytes)
3542       andl(len, 0xfffffff0);   // vector count (in bytes)
3543       jcc(Assembler::zero, COMPARE_TAIL);
3544 
3545       lea(ary1, Address(ary1, len, Address::times_1));
3546       negptr(len);
3547 
3548       movl(tmp1, 0x80808080);
3549       movdl(vec2, tmp1);
3550       pshufd(vec2, vec2, 0);
3551 
3552       bind(COMPARE_WIDE_VECTORS);
3553       movdqu(vec1, Address(ary1, len, Address::times_1));
3554       ptest(vec1, vec2);
3555       jcc(Assembler::notZero, TRUE_LABEL);
3556       addptr(len, 16);
3557       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3558 
3559       testl(result, result);
3560       jcc(Assembler::zero, FALSE_LABEL);
3561 
3562       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3563       ptest(vec1, vec2);
3564       jccb(Assembler::notZero, TRUE_LABEL);
3565       jmpb(FALSE_LABEL);
3566 
3567       bind(COMPARE_TAIL); // len is zero
3568       movl(len, result);
3569       // Fallthru to tail compare
3570     }
3571   }
3572   // Compare 4-byte vectors
3573   andl(len, 0xfffffffc); // vector count (in bytes)
3574   jccb(Assembler::zero, COMPARE_CHAR);
3575 
3576   lea(ary1, Address(ary1, len, Address::times_1));
3577   negptr(len);
3578 
3579   bind(COMPARE_VECTORS);
3580   movl(tmp1, Address(ary1, len, Address::times_1));
3581   andl(tmp1, 0x80808080);
3582   jccb(Assembler::notZero, TRUE_LABEL);
3583   addptr(len, 4);
3584   jcc(Assembler::notZero, COMPARE_VECTORS);
3585 
3586   // Compare trailing char (final 2 bytes), if any
3587   bind(COMPARE_CHAR);
3588   testl(result, 0x2);   // tail  char
3589   jccb(Assembler::zero, COMPARE_BYTE);
3590   load_unsigned_short(tmp1, Address(ary1, 0));
3591   andl(tmp1, 0x00008080);
3592   jccb(Assembler::notZero, TRUE_LABEL);
3593   subptr(result, 2);
3594   lea(ary1, Address(ary1, 2));
3595 
3596   bind(COMPARE_BYTE);
3597   testl(result, 0x1);   // tail  byte
3598   jccb(Assembler::zero, FALSE_LABEL);
3599   load_unsigned_byte(tmp1, Address(ary1, 0));
3600   andl(tmp1, 0x00000080);
3601   jccb(Assembler::notEqual, TRUE_LABEL);
3602   jmpb(FALSE_LABEL);
3603 
3604   bind(TRUE_LABEL);
3605   movl(result, 1);   // return true
3606   jmpb(DONE);
3607 
3608   bind(FALSE_LABEL);
3609   xorl(result, result); // return false
3610 
3611   // That's it
3612   bind(DONE);
3613   if (UseAVX >= 2 && UseSSE >= 2) {
3614     // clean upper bits of YMM registers
3615     vpxor(vec1, vec1);
3616     vpxor(vec2, vec2);
3617   }
3618 }
3619 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
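     // Roughly equivalent scalar code for the is_array_equ case (an illustrative
     // sketch only, not taken from the JDK sources):
     //   static boolean equals(byte[] a, byte[] b) {
     //     if (a == b) return true;
     //     if (a == null || b == null) return false;
     //     if (a.length != b.length) return false;
     //     for (int i = 0; i < a.length; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }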
3620 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3621                                       Register limit, Register result, Register chr,
3622                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3623   ShortBranchVerifier sbv(this);
3624   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3625 
3626   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3627   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3628 
3629   if (is_array_equ) {
3630     // Check the input args
3631     cmpoop(ary1, ary2);
3632     jcc(Assembler::equal, TRUE_LABEL);
3633 
3634     // Need additional checks for arrays_equals.
3635     testptr(ary1, ary1);
3636     jcc(Assembler::zero, FALSE_LABEL);
3637     testptr(ary2, ary2);
3638     jcc(Assembler::zero, FALSE_LABEL);
3639 
3640     // Check the lengths
3641     movl(limit, Address(ary1, length_offset));
3642     cmpl(limit, Address(ary2, length_offset));
3643     jcc(Assembler::notEqual, FALSE_LABEL);
3644   }
3645 
3646   // count == 0
3647   testl(limit, limit);
3648   jcc(Assembler::zero, TRUE_LABEL);
3649 
3650   if (is_array_equ) {
3651     // Load array address
3652     lea(ary1, Address(ary1, base_offset));
3653     lea(ary2, Address(ary2, base_offset));
3654   }
3655 
3656   if (is_array_equ && is_char) {
3657     // arrays_equals when used for char[].
3658     shll(limit, 1);      // byte count != 0
3659   }
3660   movl(result, limit); // copy
3661 
3662   if (UseAVX >= 2) {
3663     // With AVX2, use 32-byte vector compare
3664     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3665 
3666     // Compare 32-byte vectors
3667     andl(result, 0x0000001f);  //   tail count (in bytes)
3668     andl(limit, 0xffffffe0);   // vector count (in bytes)
3669     jcc(Assembler::zero, COMPARE_TAIL);
3670 
3671     lea(ary1, Address(ary1, limit, Address::times_1));
3672     lea(ary2, Address(ary2, limit, Address::times_1));
3673     negptr(limit);
3674 
3675 #ifdef _LP64
3676     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3677       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3678 
3679       cmpl(limit, -64);
3680       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3681 
3682       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3683 
3684       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3685       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3686       kortestql(mask, mask);
3687       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3688       addptr(limit, 64);  // update since we already compared at this addr
3689       cmpl(limit, -64);
3690       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3691 
3692       // At this point we may still need to compare -limit+result bytes.
3693       // We could execute the next two instructions and just continue via the non-wide path:
3694       //  cmpl(limit, 0);
3695       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3696       // But since we stopped at the points ary{1,2}+limit which are
3697       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3698       // (|limit| <= 32 and result < 32),
3699       // we may just compare the last 64 bytes.
3700       //
3701       addptr(result, -64);   // it is safe because we just came from this area
3702       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3703       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3704       kortestql(mask, mask);
3705       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3706 
3707       jmp(TRUE_LABEL);
3708 
3709       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3710 
3711     }//if (VM_Version::supports_avx512vlbw())
3712 #endif //_LP64
3713     bind(COMPARE_WIDE_VECTORS);
3714     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3715     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3716     vpxor(vec1, vec2);
3717 
3718     vptest(vec1, vec1);
3719     jcc(Assembler::notZero, FALSE_LABEL);
3720     addptr(limit, 32);
3721     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3722 
3723     testl(result, result);
3724     jcc(Assembler::zero, TRUE_LABEL);
3725 
3726     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3727     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3728     vpxor(vec1, vec2);
3729 
3730     vptest(vec1, vec1);
3731     jccb(Assembler::notZero, FALSE_LABEL);
3732     jmpb(TRUE_LABEL);
3733 
3734     bind(COMPARE_TAIL); // limit is zero
3735     movl(limit, result);
3736     // Fallthru to tail compare
3737   } else if (UseSSE42Intrinsics) {
3738     // With SSE4.2, use double quad vector compare
3739     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3740 
3741     // Compare 16-byte vectors
3742     andl(result, 0x0000000f);  //   tail count (in bytes)
3743     andl(limit, 0xfffffff0);   // vector count (in bytes)
3744     jcc(Assembler::zero, COMPARE_TAIL);
3745 
3746     lea(ary1, Address(ary1, limit, Address::times_1));
3747     lea(ary2, Address(ary2, limit, Address::times_1));
3748     negptr(limit);
3749 
3750     bind(COMPARE_WIDE_VECTORS);
3751     movdqu(vec1, Address(ary1, limit, Address::times_1));
3752     movdqu(vec2, Address(ary2, limit, Address::times_1));
3753     pxor(vec1, vec2);
3754 
3755     ptest(vec1, vec1);
3756     jcc(Assembler::notZero, FALSE_LABEL);
3757     addptr(limit, 16);
3758     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3759 
3760     testl(result, result);
3761     jcc(Assembler::zero, TRUE_LABEL);
3762 
3763     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3764     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3765     pxor(vec1, vec2);
3766 
3767     ptest(vec1, vec1);
3768     jccb(Assembler::notZero, FALSE_LABEL);
3769     jmpb(TRUE_LABEL);
3770 
3771     bind(COMPARE_TAIL); // limit is zero
3772     movl(limit, result);
3773     // Fallthru to tail compare
3774   }
3775 
3776   // Compare 4-byte vectors
3777   andl(limit, 0xfffffffc); // vector count (in bytes)
3778   jccb(Assembler::zero, COMPARE_CHAR);
3779 
3780   lea(ary1, Address(ary1, limit, Address::times_1));
3781   lea(ary2, Address(ary2, limit, Address::times_1));
3782   negptr(limit);
3783 
3784   bind(COMPARE_VECTORS);
3785   movl(chr, Address(ary1, limit, Address::times_1));
3786   cmpl(chr, Address(ary2, limit, Address::times_1));
3787   jccb(Assembler::notEqual, FALSE_LABEL);
3788   addptr(limit, 4);
3789   jcc(Assembler::notZero, COMPARE_VECTORS);
3790 
3791   // Compare trailing char (final 2 bytes), if any
3792   bind(COMPARE_CHAR);
3793   testl(result, 0x2);   // tail  char
3794   jccb(Assembler::zero, COMPARE_BYTE);
3795   load_unsigned_short(chr, Address(ary1, 0));
3796   load_unsigned_short(limit, Address(ary2, 0));
3797   cmpl(chr, limit);
3798   jccb(Assembler::notEqual, FALSE_LABEL);
3799 
3800   if (is_array_equ && is_char) {
3801     bind(COMPARE_BYTE);
3802   } else {
3803     lea(ary1, Address(ary1, 2));
3804     lea(ary2, Address(ary2, 2));
3805 
3806     bind(COMPARE_BYTE);
3807     testl(result, 0x1);   // tail  byte
3808     jccb(Assembler::zero, TRUE_LABEL);
3809     load_unsigned_byte(chr, Address(ary1, 0));
3810     load_unsigned_byte(limit, Address(ary2, 0));
3811     cmpl(chr, limit);
3812     jccb(Assembler::notEqual, FALSE_LABEL);
3813   }
3814   bind(TRUE_LABEL);
3815   movl(result, 1);   // return true
3816   jmpb(DONE);
3817 
3818   bind(FALSE_LABEL);
3819   xorl(result, result); // return false
3820 
3821   // That's it
3822   bind(DONE);
3823   if (UseAVX >= 2) {
3824     // clean upper bits of YMM registers
3825     vpxor(vec1, vec1);
3826     vpxor(vec2, vec2);
3827   }
3828 }
3829 
3830 #ifdef _LP64
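     // The incoming vector mask is assumed to hold one boolean (0/1) byte per lane;
     // "0 - mask" turns each true lane into 0xFF so the lane sign bits can be
     // collected into a scalar bitmask, after which:
     //   Op_VectorMaskTrueCount  -> popcnt(bitmask)
     //   Op_VectorMaskFirstTrue  -> bsf(bitmask), or masklen when the mask is empty
     //   Op_VectorMaskLastTrue   -> bsr(bitmask), or -1 when the mask is empty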
3831 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3832                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3833   assert(VM_Version::supports_avx512vlbw(), "");
3834   vpxor(xtmp, xtmp, xtmp, vec_enc);
3835   vpsubb(xtmp, xtmp, mask, vec_enc);
3836   evpmovb2m(ktmp, xtmp, vec_enc);
3837   kmovql(tmp, ktmp);
3838   switch(opc) {
3839     case Op_VectorMaskTrueCount:
3840       popcntq(dst, tmp);
3841       break;
3842     case Op_VectorMaskLastTrue:
3843       mov64(dst, -1);
3844       bsrq(tmp, tmp);
3845       cmov(Assembler::notZero, dst, tmp);
3846       break;
3847     case Op_VectorMaskFirstTrue:
3848       mov64(dst, masklen);
3849       bsfq(tmp, tmp);
3850       cmov(Assembler::notZero, dst, tmp);
3851       break;
3852     default: assert(false, "Unhandled mask operation");
3853   }
3854 }
3855 
3856 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3857                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3858   assert(VM_Version::supports_avx(), "");
3859   vpxor(xtmp, xtmp, xtmp, vec_enc);
3860   vpsubb(xtmp, xtmp, mask, vec_enc);
3861   vpmovmskb(tmp, xtmp, vec_enc);
3862   if (masklen < 64) {
3863     andq(tmp, (((jlong)1 << masklen) - 1));
3864   }
3865   switch(opc) {
3866     case Op_VectorMaskTrueCount:
3867       popcntq(dst, tmp);
3868       break;
3869     case Op_VectorMaskLastTrue:
3870       mov64(dst, -1);
3871       bsrq(tmp, tmp);
3872       cmov(Assembler::notZero, dst, tmp);
3873       break;
3874     case Op_VectorMaskFirstTrue:
3875       mov64(dst, masklen);
3876       bsfq(tmp, tmp);
3877       cmov(Assembler::notZero, dst, tmp);
3878       break;
3879     default: assert(false, "Unhandled mask operation");
3880   }
3881 }
3882 #endif
--- EOF ---