/*
 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}
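// A minimal usage sketch (illustrative, not emitted anywhere verbatim): a
// 32-byte vector op picks the 256-bit encoding, e.g.
//   vpaddd(dst, src1, src2, vector_length_encoding(32));  // == Assembler::AVX_256bit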
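// Builds a lane mask with the low 'src' lanes enabled, i.e. mask = (1 << src) - 1,
// and leaves the lane count in dst (dst == src on exit).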
void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
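  // The low-order TSC bits act as a cheap pseudo-random source: since count is a
  // power of two, (tsc & (count-1)) != 0 holds for (count-1)/count of all samples,
  // so the caller's counter update runs only ~1/count of the time.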
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
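  //   For example, assuming the default RTMAbortRatio of 50, the no_rtm bit is set
  //   once abort_count * 100 >= total_count * RTMTotalCountIncrRate * 50, i.e. once
  //   roughly half of all sampled transactions have aborted.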

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
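  // xbegin starts the transaction; on abort the CPU rolls back and resumes at
  // L_on_abort with the abort status in EAX, otherwise execution falls through
  // transactionally.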
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//    ("nax" denotes any register other than EAX.)
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.
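//
// For example, C2's cmpFastLock expansion conceptually reduces to the following
// (a sketch, not the exact ADL output):
//   fast_lock(obj, box, rax, scr, ...);   // sets ZF
//   jne  slow_path_stub                   // ZF == 0 -> runtime monitorenter
//   ...                                   // ZF == 1 -> lock acquired inline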


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  if (!UseHeavyMonitors) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
    jcc(Assembler::equal, DONE_LABEL);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
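    // The mask keeps the page-distance bits plus the low alignment bits, so the
    // AND leaves zero (ZF == 1) only if the fetched markword is a sufficiently
    // aligned address within one page of this thread's SP, i.e. our own BasicLock.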
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}
 610 
 611 // obj: object to unlock
 612 // box: box address (displaced header location), killed.  Must be EAX.
 613 // tmp: killed, cannot be obj nor box.
 614 //
 615 // Some commentary on balanced locking:
 616 //
 617 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 618 // Methods that don't have provably balanced locking are forced to run in the
 619 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 620 // The interpreter provides two properties:
 621 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't say, however, what will occur if a program engages in such mixed-mode locking.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (!UseHeavyMonitors) {
    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
    jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  if (!UseHeavyMonitors) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jccb  (Assembler::zero, Stacked);
  }

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
// IA32's memory-model is TSO, so STs are ordered with respect to
// each other and there's no need for an explicit barrier (fence).
// See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  if (!UseHeavyMonitors) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  }
#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
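      // There is no pminsq below AVX-512, so compute the mask xmm0 = (dst > src)
      // with pcmpgtq and blend: blendvpd implicitly selects src lanes wherever
      // xmm0's sign bit is set, leaving the elementwise minimum in dst.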
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

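// These follow Java's Math.min/max semantics: -0.0 orders below +0.0 and a NaN
// in either input yields NaN. The sign-bit blends canonicalize the operand
// order, and the unordered compare patches NaN lanes into the final result.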
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-AVX512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-AVX512 systems
1222   } else if (opcode == Op_LShiftVL) {
1223     psllq(dst, shift);
1224   } else {
1225     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1226     psrlq(dst, shift);
1227   }
1228 }
1229 
1230 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1231   switch (opcode) {
1232     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1233     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1234     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1235 
1236     default: assert(false, "%s", NodeClassNames[opcode]);
1237   }
1238 }
1239 
1240 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1241   if (opcode == Op_RShiftVL) {
1242     evpsraq(dst, nds, shift, vector_len);
1243   } else if (opcode == Op_LShiftVL) {
1244     vpsllq(dst, nds, shift, vector_len);
1245   } else {
1246     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1247     vpsrlq(dst, nds, shift, vector_len);
1248   }
1249 }
1250 
1251 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1252   switch (opcode) {
1253     case Op_RShiftVB:  // fall-through
1254     case Op_RShiftVS:  // fall-through
1255     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1256 
1257     case Op_LShiftVB:  // fall-through
1258     case Op_LShiftVS:  // fall-through
1259     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1260 
1261     case Op_URShiftVB: // fall-through
1262     case Op_URShiftVS: // fall-through
1263     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1264 
1265     default: assert(false, "%s", NodeClassNames[opcode]);
1266   }
1267 }
1268 
1269 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1270   switch (opcode) {
1271     case Op_RShiftVB:  // fall-through
1272     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1273 
1274     case Op_LShiftVB:  // fall-through
1275     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1276 
1277     case Op_URShiftVB: // fall-through
1278     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1279 
1280     default: assert(false, "%s", NodeClassNames[opcode]);
1281   }
1282 }
1283 
1284 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1285   assert(UseAVX >= 2, "required");
1286   switch (opcode) {
1287     case Op_RShiftVL: {
1288       if (UseAVX > 2) {
1289         assert(tmp == xnoreg, "not used");
1290         if (!VM_Version::supports_avx512vl()) {
1291           vlen_enc = Assembler::AVX_512bit;
1292         }
1293         evpsravq(dst, src, shift, vlen_enc);
1294       } else {
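        // No variable arithmetic right shift for longs below AVX-512; emulate
        // it as sra(x, n) == ((x >>> n) ^ (m >>> n)) - (m >>> n), where m is
        // the per-lane sign-bit mask loaded from vector_long_sign_mask().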
1295         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1296         vpsrlvq(dst, src, shift, vlen_enc);
1297         vpsrlvq(tmp, tmp, shift, vlen_enc);
1298         vpxor(dst, dst, tmp, vlen_enc);
1299         vpsubq(dst, dst, tmp, vlen_enc);
1300       }
1301       break;
1302     }
1303     case Op_LShiftVL: {
1304       assert(tmp == xnoreg, "not used");
1305       vpsllvq(dst, src, shift, vlen_enc);
1306       break;
1307     }
1308     case Op_URShiftVL: {
1309       assert(tmp == xnoreg, "not used");
1310       vpsrlvq(dst, src, shift, vlen_enc);
1311       break;
1312     }
1313     default: assert(false, "%s", NodeClassNames[opcode]);
1314   }
1315 }
1316 
1317 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1318 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1319   assert(opcode == Op_LShiftVB ||
1320          opcode == Op_RShiftVB ||
1321          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1322   bool sign = (opcode != Op_URShiftVB);
1323   assert(vector_len == 0, "required");
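  // Widen the bytes to dwords, do the variable shift at dword width, mask
  // the results back into byte range, then pack the dwords down to words.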
1324   vextendbd(sign, dst, src, 1);
1325   vpmovzxbd(vtmp, shift, 1);
1326   varshiftd(opcode, dst, dst, vtmp, 1);
1327   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1328   vextracti128_high(vtmp, dst);
1329   vpackusdw(dst, dst, vtmp, 0);
1330 }
1331 
1332 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1333 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1334   assert(opcode == Op_LShiftVB ||
1335          opcode == Op_RShiftVB ||
1336          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1337   bool sign = (opcode != Op_URShiftVB);
1338   int ext_vector_len = vector_len + 1;
1339   vextendbw(sign, dst, src, ext_vector_len);
1340   vpmovzxbw(vtmp, shift, ext_vector_len);
1341   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1342   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
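  // Repack words to bytes. vpackuswb packs within 128-bit lanes, so wider
  // vectors need a cross-lane permute afterwards to restore element order.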
1343   if (vector_len == 0) {
1344     vextracti128_high(vtmp, dst);
1345     vpackuswb(dst, dst, vtmp, vector_len);
1346   } else {
1347     vextracti64x4_high(vtmp, dst);
1348     vpackuswb(dst, dst, vtmp, vector_len);
1349     vpermq(dst, dst, 0xD8, vector_len);
1350   }
1351 }
1352 
1353 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1354   switch(typ) {
1355     case T_BYTE:
1356       pinsrb(dst, val, idx);
1357       break;
1358     case T_SHORT:
1359       pinsrw(dst, val, idx);
1360       break;
1361     case T_INT:
1362       pinsrd(dst, val, idx);
1363       break;
1364     case T_LONG:
1365       pinsrq(dst, val, idx);
1366       break;
1367     default:
1368       assert(false,"Should not reach here.");
1369       break;
1370   }
1371 }
1372 
1373 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1374   switch(typ) {
1375     case T_BYTE:
1376       vpinsrb(dst, src, val, idx);
1377       break;
1378     case T_SHORT:
1379       vpinsrw(dst, src, val, idx);
1380       break;
1381     case T_INT:
1382       vpinsrd(dst, src, val, idx);
1383       break;
1384     case T_LONG:
1385       vpinsrq(dst, src, val, idx);
1386       break;
1387     default:
1388       assert(false,"Should not reach here.");
1389       break;
1390   }
1391 }
1392 
1393 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1394   switch(typ) {
1395     case T_INT:
1396       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1397       break;
1398     case T_FLOAT:
1399       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1400       break;
1401     case T_LONG:
1402       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1403       break;
1404     case T_DOUBLE:
1405       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1406       break;
1407     default:
1408       assert(false,"Should not reach here.");
1409       break;
1410   }
1411 }
1412 
1413 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1414   switch(typ) {
1415     case T_INT:
1416       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1417       break;
1418     case T_FLOAT:
1419       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1420       break;
1421     case T_LONG:
1422       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1423       break;
1424     case T_DOUBLE:
1425       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1426       break;
1427     default:
1428       assert(false,"Should not reach here.");
1429       break;
1430   }
1431 }
1432 
1433 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1434   switch(typ) {
1435     case T_INT:
1436       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1437       break;
1438     case T_FLOAT:
1439       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1440       break;
1441     case T_LONG:
1442       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1443       break;
1444     case T_DOUBLE:
1445       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1446       break;
1447     default:
1448       assert(false,"Should not reach here.");
1449       break;
1450   }
1451 }
1452 
1453 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1454   if (vlen_in_bytes <= 16) {
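    // Mask elements arrive as boolean 0/1 bytes; 0 - x turns them into
    // all-zeroes or all-ones lanes, and the sign extension below widens
    // the mask to the element size.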
1455     pxor (dst, dst);
1456     psubb(dst, src);
1457     switch (elem_bt) {
1458       case T_BYTE:   /* nothing to do */ break;
1459       case T_SHORT:  pmovsxbw(dst, dst); break;
1460       case T_INT:    pmovsxbd(dst, dst); break;
1461       case T_FLOAT:  pmovsxbd(dst, dst); break;
1462       case T_LONG:   pmovsxbq(dst, dst); break;
1463       case T_DOUBLE: pmovsxbq(dst, dst); break;
1464 
1465       default: assert(false, "%s", type2name(elem_bt));
1466     }
1467   } else {
1468     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1469     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1470 
1471     vpxor (dst, dst, dst, vlen_enc);
1472     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1473 
1474     switch (elem_bt) {
1475       case T_BYTE:   /* nothing to do */            break;
1476       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1477       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1478       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1479       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1480       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1481 
1482       default: assert(false, "%s", type2name(elem_bt));
1483     }
1484   }
1485 }
1486 
1487 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp,
1488                                          Register tmp, bool novlbwdq, int vlen_enc) {
1489   if (novlbwdq) {
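    // evpmovb2m needs AVX512BW (+VL for subword lengths); without it, widen
    // the boolean bytes to dwords and compare them directly into the opmask.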
1490     vpmovsxbd(xtmp, src, vlen_enc);
1491     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1492             Assembler::eq, true, vlen_enc, tmp);
1493   } else {
1494     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1495     vpsubb(xtmp, xtmp, src, vlen_enc);
1496     evpmovb2m(dst, xtmp, vlen_enc);
1497   }
1498 }
1499 
1500 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1501   switch (vlen_in_bytes) {
1502   case 4:  movdl(dst, src);   break;
1503   case 8:  movq(dst, src);    break;
1504   case 16: movdqu(dst, src);  break;
1505   case 32: vmovdqu(dst, src); break;
1506   case 64: evmovdquq(dst, src, Assembler::AVX_512bit); break;
1507   default: ShouldNotReachHere();
1508   }
1509 }
1510 
1511 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1512   if (reachable(src)) {
1513     load_vector(dst, as_Address(src), vlen_in_bytes);
1514   } else {
1515     lea(rscratch, src);
1516     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1517   }
1518 }
1519 
1520 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1521   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1522   if (vlen_in_bytes == 4) {
1523     movdl(dst, addr);
1524   } else if (vlen_in_bytes == 8) {
1525     movq(dst, addr);
1526   } else if (vlen_in_bytes == 16) {
1527     movdqu(dst, addr, scratch);
1528   } else if (vlen_in_bytes == 32) {
1529     vmovdqu(dst, addr, scratch);
1530   } else {
1531     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1532     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1533   }
1534 }
1535 
1536 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1537 
1538 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1539   int vector_len = Assembler::AVX_128bit;
1540 
1541   switch (opcode) {
1542     case Op_AndReductionV:  pand(dst, src); break;
1543     case Op_OrReductionV:   por (dst, src); break;
1544     case Op_XorReductionV:  pxor(dst, src); break;
1545     case Op_MinReductionV:
1546       switch (typ) {
1547         case T_BYTE:        pminsb(dst, src); break;
1548         case T_SHORT:       pminsw(dst, src); break;
1549         case T_INT:         pminsd(dst, src); break;
1550         case T_LONG:        assert(UseAVX > 2, "required");
1551                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1552         default:            assert(false, "wrong type");
1553       }
1554       break;
1555     case Op_MaxReductionV:
1556       switch (typ) {
1557         case T_BYTE:        pmaxsb(dst, src); break;
1558         case T_SHORT:       pmaxsw(dst, src); break;
1559         case T_INT:         pmaxsd(dst, src); break;
1560         case T_LONG:        assert(UseAVX > 2, "required");
1561                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1562         default:            assert(false, "wrong type");
1563       }
1564       break;
1565     case Op_AddReductionVF: addss(dst, src); break;
1566     case Op_AddReductionVD: addsd(dst, src); break;
1567     case Op_AddReductionVI:
1568       switch (typ) {
1569         case T_BYTE:        paddb(dst, src); break;
1570         case T_SHORT:       paddw(dst, src); break;
1571         case T_INT:         paddd(dst, src); break;
1572         default:            assert(false, "wrong type");
1573       }
1574       break;
1575     case Op_AddReductionVL: paddq(dst, src); break;
1576     case Op_MulReductionVF: mulss(dst, src); break;
1577     case Op_MulReductionVD: mulsd(dst, src); break;
1578     case Op_MulReductionVI:
1579       switch (typ) {
1580         case T_SHORT:       pmullw(dst, src); break;
1581         case T_INT:         pmulld(dst, src); break;
1582         default:            assert(false, "wrong type");
1583       }
1584       break;
1585     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1586                             vpmullq(dst, dst, src, vector_len); break;
1587     default:                assert(false, "wrong opcode");
1588   }
1589 }
1590 
1591 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1592   int vector_len = Assembler::AVX_256bit;
1593 
1594   switch (opcode) {
1595     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1596     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1597     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1598     case Op_MinReductionV:
1599       switch (typ) {
1600         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1601         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1602         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1603         case T_LONG:        assert(UseAVX > 2, "required");
1604                             vpminsq(dst, src1, src2, vector_len); break;
1605         default:            assert(false, "wrong type");
1606       }
1607       break;
1608     case Op_MaxReductionV:
1609       switch (typ) {
1610         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1611         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1612         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1613         case T_LONG:        assert(UseAVX > 2, "required");
1614                             vpmaxsq(dst, src1, src2, vector_len); break;
1615         default:            assert(false, "wrong type");
1616       }
1617       break;
1618     case Op_AddReductionVI:
1619       switch (typ) {
1620         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1621         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1622         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1623         default:            assert(false, "wrong type");
1624       }
1625       break;
1626     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1627     case Op_MulReductionVI:
1628       switch (typ) {
1629         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1630         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1631         default:            assert(false, "wrong type");
1632       }
1633       break;
1634     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1635     default:                assert(false, "wrong opcode");
1636   }
1637 }
1638 
1639 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1640                                   XMMRegister dst, XMMRegister src,
1641                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1642   switch (opcode) {
1643     case Op_AddReductionVF:
1644     case Op_MulReductionVF:
1645       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1646       break;
1647 
1648     case Op_AddReductionVD:
1649     case Op_MulReductionVD:
1650       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1651       break;
1652 
1653     default: assert(false, "wrong opcode");
1654   }
1655 }
1656 
1657 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1658                              Register dst, Register src1, XMMRegister src2,
1659                              XMMRegister vtmp1, XMMRegister vtmp2) {
1660   switch (vlen) {
1661     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1662     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1663     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1664     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1665 
1666     default: assert(false, "wrong vector length");
1667   }
1668 }
1669 
1670 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1671                              Register dst, Register src1, XMMRegister src2,
1672                              XMMRegister vtmp1, XMMRegister vtmp2) {
1673   switch (vlen) {
1674     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1675     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1676     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1677     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1678 
1679     default: assert(false, "wrong vector length");
1680   }
1681 }
1682 
1683 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1684                              Register dst, Register src1, XMMRegister src2,
1685                              XMMRegister vtmp1, XMMRegister vtmp2) {
1686   switch (vlen) {
1687     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1688     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1689     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1690     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1691 
1692     default: assert(false, "wrong vector length");
1693   }
1694 }
1695 
1696 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1697                              Register dst, Register src1, XMMRegister src2,
1698                              XMMRegister vtmp1, XMMRegister vtmp2) {
1699   switch (vlen) {
1700     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1701     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1702     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1703     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1704 
1705     default: assert(false, "wrong vector length");
1706   }
1707 }
1708 
1709 #ifdef _LP64
1710 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1711                              Register dst, Register src1, XMMRegister src2,
1712                              XMMRegister vtmp1, XMMRegister vtmp2) {
1713   switch (vlen) {
1714     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1715     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1716     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1717 
1718     default: assert(false, "wrong vector length");
1719   }
1720 }
1721 #endif // _LP64
1722 
1723 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1724   switch (vlen) {
1725     case 2:
1726       assert(vtmp2 == xnoreg, "");
1727       reduce2F(opcode, dst, src, vtmp1);
1728       break;
1729     case 4:
1730       assert(vtmp2 == xnoreg, "");
1731       reduce4F(opcode, dst, src, vtmp1);
1732       break;
1733     case 8:
1734       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1735       break;
1736     case 16:
1737       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1738       break;
1739     default: assert(false, "wrong vector length");
1740   }
1741 }
1742 
1743 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1744   switch (vlen) {
1745     case 2:
1746       assert(vtmp2 == xnoreg, "");
1747       reduce2D(opcode, dst, src, vtmp1);
1748       break;
1749     case 4:
1750       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1751       break;
1752     case 8:
1753       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1754       break;
1755     default: assert(false, "wrong vector length");
1756   }
1757 }
1758 
1759 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
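  // Fold the two lanes together (phaddd for add, shuffle + op otherwise),
  // then combine with the scalar accumulator src1 and move the result to dst.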
1760   if (opcode == Op_AddReductionVI) {
1761     if (vtmp1 != src2) {
1762       movdqu(vtmp1, src2);
1763     }
1764     phaddd(vtmp1, vtmp1);
1765   } else {
1766     pshufd(vtmp1, src2, 0x1);
1767     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1768   }
1769   movdl(vtmp2, src1);
1770   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1771   movdl(dst, vtmp1);
1772 }
1773 
1774 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1775   if (opcode == Op_AddReductionVI) {
1776     if (vtmp1 != src2) {
1777       movdqu(vtmp1, src2);
1778     }
1779     phaddd(vtmp1, src2);
1780     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1781   } else {
1782     pshufd(vtmp2, src2, 0xE);
1783     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1784     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1785   }
1786 }
1787 
1788 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1789   if (opcode == Op_AddReductionVI) {
1790     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1791     vextracti128_high(vtmp2, vtmp1);
1792     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1793     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1794   } else {
1795     vextracti128_high(vtmp1, src2);
1796     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1797     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1798   }
1799 }
1800 
1801 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1802   vextracti64x4_high(vtmp2, src2);
1803   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1804   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1805 }
1806 
1807 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
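  // Fold 8 bytes to 4, 4 to 2, and 2 to 1, then sign-extend the surviving
  // byte and fold in the scalar accumulator src1.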
1808   pshufd(vtmp2, src2, 0x1);
1809   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1810   movdqu(vtmp1, vtmp2);
1811   psrldq(vtmp1, 2);
1812   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1813   movdqu(vtmp2, vtmp1);
1814   psrldq(vtmp2, 1);
1815   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1816   movdl(vtmp2, src1);
1817   pmovsxbd(vtmp1, vtmp1);
1818   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1819   pextrb(dst, vtmp1, 0x0);
1820   movsbl(dst, dst);
1821 }
1822 
1823 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1824   pshufd(vtmp1, src2, 0xE);
1825   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1826   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1827 }
1828 
1829 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1830   vextracti128_high(vtmp2, src2);
1831   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1832   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1833 }
1834 
1835 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1836   vextracti64x4_high(vtmp1, src2);
1837   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1838   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1839 }
1840 
1841 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1842   pmovsxbw(vtmp2, src2);
1843   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1844 }
1845 
1846 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1847   if (UseAVX > 1) {
1848     int vector_len = Assembler::AVX_256bit;
1849     vpmovsxbw(vtmp1, src2, vector_len);
1850     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1851   } else {
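    // SSE only: reduce the low 8 bytes at word width, then bring the high
    // 8 bytes down and reduce them into the running result in dst.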
1852     pmovsxbw(vtmp2, src2);
1853     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1857   }
1858 }
1859 
1860 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1861   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1862     int vector_len = Assembler::AVX_512bit;
1863     vpmovsxbw(vtmp1, src2, vector_len);
1864     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1865   } else {
1866     assert(UseAVX >= 2,"Should not reach here.");
1867     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1868     vextracti128_high(vtmp2, src2);
1869     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1870   }
1871 }
1872 
1873 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1874   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1875   vextracti64x4_high(vtmp2, src2);
1876   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1877 }
1878 
1879 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1880   if (opcode == Op_AddReductionVI) {
1881     if (vtmp1 != src2) {
1882       movdqu(vtmp1, src2);
1883     }
1884     phaddw(vtmp1, vtmp1);
1885     phaddw(vtmp1, vtmp1);
1886   } else {
1887     pshufd(vtmp2, src2, 0x1);
1888     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1889     movdqu(vtmp1, vtmp2);
1890     psrldq(vtmp1, 2);
1891     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1892   }
1893   movdl(vtmp2, src1);
1894   pmovsxwd(vtmp1, vtmp1);
1895   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1896   pextrw(dst, vtmp1, 0x0);
1897   movswl(dst, dst);
1898 }
1899 
1900 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1901   if (opcode == Op_AddReductionVI) {
1902     if (vtmp1 != src2) {
1903       movdqu(vtmp1, src2);
1904     }
1905     phaddw(vtmp1, src2);
1906   } else {
1907     pshufd(vtmp1, src2, 0xE);
1908     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1909   }
1910   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1911 }
1912 
1913 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1914   if (opcode == Op_AddReductionVI) {
1915     int vector_len = Assembler::AVX_256bit;
1916     vphaddw(vtmp2, src2, src2, vector_len);
1917     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1918   } else {
1919     vextracti128_high(vtmp2, src2);
1920     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1921   }
1922   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1923 }
1924 
1925 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1927   vextracti64x4_high(vtmp1, src2);
1928   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1929   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1930 }
1931 
1932 #ifdef _LP64
1933 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1934   pshufd(vtmp2, src2, 0xE);
1935   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1936   movdq(vtmp1, src1);
1937   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1938   movdq(dst, vtmp1);
1939 }
1940 
1941 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1942   vextracti128_high(vtmp1, src2);
1943   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1944   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1945 }
1946 
1947 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1948   vextracti64x4_high(vtmp2, src2);
1949   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1950   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1951 }
1952 
1953 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1954   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
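  // temp = (1 << len) - 1: BZHI zeroes all bits at positions >= len.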
1955   mov64(temp, -1L);
1956   bzhiq(temp, temp, len);
1957   kmovql(dst, temp);
1958 }
1959 #endif // _LP64
1960 
1961 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1962   reduce_operation_128(T_FLOAT, opcode, dst, src);
1963   pshufd(vtmp, src, 0x1);
1964   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1965 }
1966 
1967 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1968   reduce2F(opcode, dst, src, vtmp);
1969   pshufd(vtmp, src, 0x2);
1970   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1971   pshufd(vtmp, src, 0x3);
1972   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1973 }
1974 
1975 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1976   reduce4F(opcode, dst, src, vtmp2);
1977   vextractf128_high(vtmp2, src);
1978   reduce4F(opcode, dst, vtmp2, vtmp1);
1979 }
1980 
1981 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1982   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1983   vextracti64x4_high(vtmp1, src);
1984   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1985 }
1986 
1987 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1988   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1989   pshufd(vtmp, src, 0xE);
1990   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1991 }
1992 
1993 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1994   reduce2D(opcode, dst, src, vtmp2);
1995   vextractf128_high(vtmp2, src);
1996   reduce2D(opcode, dst, vtmp2, vtmp1);
1997 }
1998 
1999 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2000   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2001   vextracti64x4_high(vtmp1, src);
2002   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2003 }
2004 
2005 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
2006   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2007 }
2008 
2009 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
2010   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2011 }
2012 
2014 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2015                                           XMMRegister dst, XMMRegister src,
2016                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2017                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2018   int permconst[] = {1, 14};
2019   XMMRegister wsrc = src;
2020   XMMRegister wdst = xmm_0;
2021   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2022 
2023   int vlen_enc = Assembler::AVX_128bit;
2024   if (vlen == 16) {
2025     vlen_enc = Assembler::AVX_256bit;
2026   }
2027 
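  // log2(vlen) folding rounds: each round combines the upper half of the
  // working vector into the lower half, using cross-lane extracts for the
  // 512/256-bit halves and in-lane permutes for the last two rounds.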
2028   for (int i = log2(vlen) - 1; i >=0; i--) {
2029     if (i == 0 && !is_dst_valid) {
2030       wdst = dst;
2031     }
2032     if (i == 3) {
2033       vextracti64x4_high(wtmp, wsrc);
2034     } else if (i == 2) {
2035       vextracti128_high(wtmp, wsrc);
2036     } else { // i = [0,1]
2037       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2038     }
2039     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2040     wsrc = wdst;
2041     vlen_enc = Assembler::AVX_128bit;
2042   }
2043   if (is_dst_valid) {
2044     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2045   }
2046 }
2047 
2048 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2049                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2050                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2051   XMMRegister wsrc = src;
2052   XMMRegister wdst = xmm_0;
2053   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2054   int vlen_enc = Assembler::AVX_128bit;
2055   if (vlen == 8) {
2056     vlen_enc = Assembler::AVX_256bit;
2057   }
2058   for (int i = log2(vlen) - 1; i >=0; i--) {
2059     if (i == 0 && !is_dst_valid) {
2060       wdst = dst;
2061     }
2062     if (i == 1) {
2063       vextracti128_high(wtmp, wsrc);
2064     } else if (i == 2) {
2065       vextracti64x4_high(wtmp, wsrc);
2066     } else {
2067       assert(i == 0, "%d", i);
2068       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2069     }
2070     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2071     wsrc = wdst;
2072     vlen_enc = Assembler::AVX_128bit;
2073   }
2074   if (is_dst_valid) {
2075     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2076   }
2077 }
2078 
2079 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2080   switch (bt) {
2081     case T_BYTE:  pextrb(dst, src, idx); break;
2082     case T_SHORT: pextrw(dst, src, idx); break;
2083     case T_INT:   pextrd(dst, src, idx); break;
2084     case T_LONG:  pextrq(dst, src, idx); break;
2085 
2086     default:
2087       assert(false,"Should not reach here.");
2088       break;
2089   }
2090 }
2091 
2092 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2093   int esize =  type2aelembytes(typ);
2094   int elem_per_lane = 16/esize;
2095   int lane = elemindex / elem_per_lane;
2096   int eindex = elemindex % elem_per_lane;
2097 
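  // Only the 128-bit lane is selected here; extracting the element at
  // eindex within the lane is left to the caller.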
2098   if (lane >= 2) {
2099     assert(UseAVX > 2, "required");
2100     vextractf32x4(dst, src, lane & 3);
2101     return dst;
2102   } else if (lane > 0) {
2103     assert(UseAVX > 0, "required");
2104     vextractf128(dst, src, lane);
2105     return dst;
2106   } else {
2107     return src;
2108   }
2109 }
2110 
2111 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2112   int esize =  type2aelembytes(typ);
2113   int elem_per_lane = 16/esize;
2114   int eindex = elemindex % elem_per_lane;
2115   assert(is_integral_type(typ),"required");
2116 
2117   if (eindex == 0) {
2118     if (typ == T_LONG) {
2119       movq(dst, src);
2120     } else {
2121       movdl(dst, src);
2122       if (typ == T_BYTE)
2123         movsbl(dst, dst);
2124       else if (typ == T_SHORT)
2125         movswl(dst, dst);
2126     }
2127   } else {
2128     extract(typ, dst, src, eindex);
2129   }
2130 }
2131 
2132 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2133   int esize =  type2aelembytes(typ);
2134   int elem_per_lane = 16/esize;
2135   int eindex = elemindex % elem_per_lane;
2136   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2137 
2138   if (eindex == 0) {
2139     movq(dst, src);
2140   } else {
2141     if (typ == T_FLOAT) {
2142       if (UseAVX == 0) {
2143         movdqu(dst, src);
2144         pshufps(dst, dst, eindex);
2145       } else {
2146         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2147       }
2148     } else {
2149       if (UseAVX == 0) {
2150         movdqu(dst, src);
2151         psrldq(dst, eindex*esize);
2152       } else {
2153         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2154       }
2155       movq(dst, dst);
2156     }
2157   }
2158   // Zero upper bits
2159   if (typ == T_FLOAT) {
2160     if (UseAVX == 0) {
2161       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2162       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2163       pand(dst, vtmp);
2164     } else {
2165       assert((tmp != noreg), "required.");
2166       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2167     }
2168   }
2169 }
2170 
2171 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2172   switch(typ) {
2173     case T_BYTE:
2174     case T_BOOLEAN:
2175       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2176       break;
2177     case T_SHORT:
2178     case T_CHAR:
2179       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2180       break;
2181     case T_INT:
2182     case T_FLOAT:
2183       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2184       break;
2185     case T_LONG:
2186     case T_DOUBLE:
2187       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2188       break;
2189     default:
2190       assert(false,"Should not reach here.");
2191       break;
2192   }
2193 }
2194 
2195 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2196   switch(typ) {
2197     case T_BOOLEAN:
2198     case T_BYTE:
2199       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2200       break;
2201     case T_CHAR:
2202     case T_SHORT:
2203       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2204       break;
2205     case T_INT:
2206     case T_FLOAT:
2207       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2208       break;
2209     case T_LONG:
2210     case T_DOUBLE:
2211       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2212       break;
2213     default:
2214       assert(false,"Should not reach here.");
2215       break;
2216   }
2217 }
2218 
2219 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2220   switch(typ) {
2221     case T_BYTE:
2222       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2223       break;
2224     case T_SHORT:
2225       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2226       break;
2227     case T_INT:
2228     case T_FLOAT:
2229       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2230       break;
2231     case T_LONG:
2232     case T_DOUBLE:
2233       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2234       break;
2235     default:
2236       assert(false,"Should not reach here.");
2237       break;
2238   }
2239 }
2240 
2241 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2242                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2243   switch(vlen) {
2244     case 4:
2245       assert(vtmp1 != xnoreg, "required.");
2246       // Broadcast lower 32 bits to 128 bits before ptest
2247       pshufd(vtmp1, src1, 0x0);
2248       if (bt == BoolTest::overflow) {
2249         assert(vtmp2 != xnoreg, "required.");
2250         pshufd(vtmp2, src2, 0x0);
2251       } else {
2252         assert(vtmp2 == xnoreg, "required.");
2253         vtmp2 = src2;
2254       }
2255       ptest(vtmp1, vtmp2);
2256      break;
2257     case 8:
2258       assert(vtmp1 != xnoreg, "required.");
2259       // Broadcast lower 64 bits to 128 bits before ptest
2260       pshufd(vtmp1, src1, 0x4);
2261       if (bt == BoolTest::overflow) {
2262         assert(vtmp2 != xnoreg, "required.");
2263         pshufd(vtmp2, src2, 0x4);
2264       } else {
2265         assert(vtmp2 == xnoreg, "required.");
2266         vtmp2 = src2;
2267       }
2268       ptest(vtmp1, vtmp2);
2269      break;
2270     case 16:
2271       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2272       ptest(src1, src2);
2273       break;
2274     case 32:
2275       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2276       vptest(src1, src2, Assembler::AVX_256bit);
2277       break;
2278     case 64:
2279       {
2280         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2281         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2282         if (bt == BoolTest::ne) {
2283           ktestql(mask, mask);
2284         } else {
2285           assert(bt == BoolTest::overflow, "required");
2286           kortestql(mask, mask);
2287         }
2288       }
2289       break;
2290     default:
2291       assert(false,"Should not reach here.");
2292       break;
2293   }
2294 }
2295 
2296 //-------------------------------------------------------------------------------------------
2297 
2298 // IndexOf for constant substrings with size >= 8 chars
2299 // which don't need to be loaded through stack.
2300 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2301                                          Register cnt1, Register cnt2,
2302                                          int int_cnt2,  Register result,
2303                                          XMMRegister vec, Register tmp,
2304                                          int ae) {
2305   ShortBranchVerifier sbv(this);
2306   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2307   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2308 
2309   // This method uses the pcmpestri instruction with bound registers
2310   //   inputs:
2311   //     xmm - substring
2312   //     rax - substring length (elements count)
2313   //     mem - scanned string
2314   //     rdx - string length (elements count)
2315   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2316   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2317   //   outputs:
2318   //     rcx - matched index in string
2319   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2320   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2321   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2322   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2323   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2324 
2325   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2326         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2327         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2328 
2329   // Note, inline_string_indexOf() generates checks:
2330   // if (substr.count > string.count) return -1;
2331   // if (substr.count == 0) return 0;
2332   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2333 
2334   // Load substring.
2335   if (ae == StrIntrinsicNode::UL) {
2336     pmovzxbw(vec, Address(str2, 0));
2337   } else {
2338     movdqu(vec, Address(str2, 0));
2339   }
2340   movl(cnt2, int_cnt2);
2341   movptr(result, str1); // string addr
2342 
2343   if (int_cnt2 > stride) {
2344     jmpb(SCAN_TO_SUBSTR);
2345 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2348     bind(RELOAD_SUBSTR);
2349     if (ae == StrIntrinsicNode::UL) {
2350       pmovzxbw(vec, Address(str2, 0));
2351     } else {
2352       movdqu(vec, Address(str2, 0));
2353     }
2354     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2355 
2356     bind(RELOAD_STR);
2357     // We came here after the beginning of the substring was
2358     // matched but the rest of it was not so we need to search
2359     // again. Start from the next element after the previous match.
2360 
    // cnt2 is the number of remaining substring elements and
    // cnt1 the number of remaining string elements when the compare failed.
2363     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2364     subl(cnt1, cnt2);
2365     addl(cnt1, int_cnt2);
2366     movl(cnt2, int_cnt2); // Now restore cnt2
2367 
2368     decrementl(cnt1);     // Shift to next element
2369     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2371 
2372     addptr(result, (1<<scale1));
2373 
2374   } // (int_cnt2 > 8)
2375 
2376   // Scan string for start of substr in 16-byte vectors
2377   bind(SCAN_TO_SUBSTR);
2378   pcmpestri(vec, Address(result, 0), mode);
2379   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2380   subl(cnt1, stride);
2381   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2382   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2384   addptr(result, 16);
2385   jmpb(SCAN_TO_SUBSTR);
2386 
2387   // Found a potential substr
2388   bind(FOUND_CANDIDATE);
2389   // Matched whole vector if first element matched (tmp(rcx) == 0).
2390   if (int_cnt2 == stride) {
2391     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2392   } else { // int_cnt2 > 8
2393     jccb(Assembler::overflow, FOUND_SUBSTR);
2394   }
2395   // After pcmpestri tmp(rcx) contains matched element index
2396   // Compute start addr of substr
2397   lea(result, Address(result, tmp, scale1));
2398 
2399   // Make sure string is still long enough
2400   subl(cnt1, tmp);
2401   cmpl(cnt1, cnt2);
2402   if (int_cnt2 == stride) {
2403     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2404   } else { // int_cnt2 > 8
2405     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2406   }
  // Left less than substring.
2408 
2409   bind(RET_NOT_FOUND);
2410   movl(result, -1);
2411   jmp(EXIT);
2412 
2413   if (int_cnt2 > stride) {
2414     // This code is optimized for the case when whole substring
2415     // is matched if its head is matched.
2416     bind(MATCH_SUBSTR_HEAD);
2417     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if the rest does not match
2419     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2420 
2421     Label CONT_SCAN_SUBSTR;
2422     // Compare the rest of substring (> 8 chars).
2423     bind(FOUND_SUBSTR);
2424     // First 8 chars are already matched.
2425     negptr(cnt2);
2426     addptr(cnt2, stride);
2427 
2428     bind(SCAN_SUBSTR);
2429     subl(cnt1, stride);
2430     cmpl(cnt2, -stride); // Do not read beyond substring
2431     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2432     // Back-up strings to avoid reading beyond substring:
2433     // cnt1 = cnt1 - cnt2 + 8
2434     addl(cnt1, cnt2); // cnt2 is negative
2435     addl(cnt1, stride);
2436     movl(cnt2, stride); negptr(cnt2);
2437     bind(CONT_SCAN_SUBSTR);
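    // cnt2 <= -stride here, so a full vector load at the end-of-substring
    // offset plus cnt2 cannot read past the substring.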
2438     if (int_cnt2 < (int)G) {
2439       int tail_off1 = int_cnt2<<scale1;
2440       int tail_off2 = int_cnt2<<scale2;
2441       if (ae == StrIntrinsicNode::UL) {
2442         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2443       } else {
2444         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2445       }
2446       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2447     } else {
2448       // calculate index in register to avoid integer overflow (int_cnt2*2)
2449       movl(tmp, int_cnt2);
2450       addptr(tmp, cnt2);
2451       if (ae == StrIntrinsicNode::UL) {
2452         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2453       } else {
2454         movdqu(vec, Address(str2, tmp, scale2, 0));
2455       }
2456       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2457     }
    // Need to reload string pointers if the whole vector did not match
2459     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2460     addptr(cnt2, stride);
2461     jcc(Assembler::negative, SCAN_SUBSTR);
2462     // Fall through if found full substring
2463 
2464   } // (int_cnt2 > 8)
2465 
2466   bind(RET_FOUND);
2467   // Found result if we matched full small substring.
2468   // Compute substr offset
2469   subptr(result, str1);
2470   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2471     shrl(result, 1); // index
2472   }
2473   bind(EXIT);
2474 
2475 } // string_indexofC8
2476 
2477 // Small strings are loaded through stack if they cross page boundary.
2478 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2479                                        Register cnt1, Register cnt2,
2480                                        int int_cnt2,  Register result,
2481                                        XMMRegister vec, Register tmp,
2482                                        int ae) {
2483   ShortBranchVerifier sbv(this);
2484   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2485   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2486 
2487   //
2488   // int_cnt2 is length of small (< 8 chars) constant substring
2489   // or (-1) for non constant substring in which case its length
2490   // is in cnt2 register.
2491   //
2492   // Note, inline_string_indexOf() generates checks:
2493   // if (substr.count > string.count) return -1;
2494   // if (substr.count == 0) return 0;
2495   //
2496   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2497   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2498   // This method uses the pcmpestri instruction with bound registers
2499   //   inputs:
2500   //     xmm - substring
2501   //     rax - substring length (elements count)
2502   //     mem - scanned string
2503   //     rdx - string length (elements count)
2504   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2505   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2506   //   outputs:
2507   //     rcx - matched index in string
2508   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2509   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2510   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2511   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2512 
2513   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2514         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2515         FOUND_CANDIDATE;
2516 
2517   { //========================================================
2518     // We don't know where these strings are located
2519     // and we can't read beyond them. Load them through stack.
2520     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2521 
2522     movptr(tmp, rsp); // save old SP
2523 
2524     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2525       if (int_cnt2 == (1>>scale2)) { // One byte
2526         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2527         load_unsigned_byte(result, Address(str2, 0));
2528         movdl(vec, result); // move 32 bits
2529       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2530         // Not enough header space in 32-bit VM: 12+3 = 15.
2531         movl(result, Address(str2, -1));
2532         shrl(result, 8);
2533         movdl(vec, result); // move 32 bits
2534       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2535         load_unsigned_short(result, Address(str2, 0));
2536         movdl(vec, result); // move 32 bits
2537       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2538         movdl(vec, Address(str2, 0)); // move 32 bits
2539       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2540         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2542         // Array header size is 12 bytes in 32-bit VM
2543         // + 6 bytes for 3 chars == 18 bytes,
2544         // enough space to load vec and shift.
2545         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2546         if (ae == StrIntrinsicNode::UL) {
2547           int tail_off = int_cnt2-8;
2548           pmovzxbw(vec, Address(str2, tail_off));
2549           psrldq(vec, -2*tail_off);
2550         }
2551         else {
2552           int tail_off = int_cnt2*(1<<scale2);
2553           movdqu(vec, Address(str2, tail_off-16));
2554           psrldq(vec, 16-tail_off);
2555         }
2556       }
2557     } else { // not constant substring
2558       cmpl(cnt2, stride);
2559       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2560 
      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
2563       assert(os::vm_page_size() < (int)G, "default page should be small");
2564       movl(result, str2); // We need only low 32 bits
2565       andl(result, (os::vm_page_size()-1));
2566       cmpl(result, (os::vm_page_size()-16));
2567       jccb(Assembler::belowEqual, CHECK_STR);
2568 
      // Move small strings to stack to allow loading 16 bytes into vec.
2570       subptr(rsp, 16);
2571       int stk_offset = wordSize-(1<<scale2);
2572       push(cnt2);
2573 
2574       bind(COPY_SUBSTR);
2575       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2576         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2577         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2578       } else if (ae == StrIntrinsicNode::UU) {
2579         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2580         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2581       }
2582       decrement(cnt2);
2583       jccb(Assembler::notZero, COPY_SUBSTR);
2584 
2585       pop(cnt2);
2586       movptr(str2, rsp);  // New substring address
2587     } // non constant
2588 
2589     bind(CHECK_STR);
2590     cmpl(cnt1, stride);
2591     jccb(Assembler::aboveEqual, BIG_STRINGS);
2592 
2593     // Check cross page boundary.
2594     movl(result, str1); // We need only low 32 bits
2595     andl(result, (os::vm_page_size()-1));
2596     cmpl(result, (os::vm_page_size()-16));
2597     jccb(Assembler::belowEqual, BIG_STRINGS);
2598 
2599     subptr(rsp, 16);
2600     int stk_offset = -(1<<scale1);
2601     if (int_cnt2 < 0) { // not constant
2602       push(cnt2);
2603       stk_offset += wordSize;
2604     }
2605     movl(cnt2, cnt1);
2606 
2607     bind(COPY_STR);
2608     if (ae == StrIntrinsicNode::LL) {
2609       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2610       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2611     } else {
2612       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2613       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2614     }
2615     decrement(cnt2);
2616     jccb(Assembler::notZero, COPY_STR);
2617 
2618     if (int_cnt2 < 0) { // not constant
2619       pop(cnt2);
2620     }
2621     movptr(str1, rsp);  // New string address
2622 
2623     bind(BIG_STRINGS);
2624     // Load substring.
2625     if (int_cnt2 < 0) { // -1
2626       if (ae == StrIntrinsicNode::UL) {
2627         pmovzxbw(vec, Address(str2, 0));
2628       } else {
2629         movdqu(vec, Address(str2, 0));
2630       }
2631       push(cnt2);       // substr count
2632       push(str2);       // substr addr
2633       push(str1);       // string addr
2634     } else {
2635       // Small (< 8 chars) constant substrings are loaded already.
2636       movl(cnt2, int_cnt2);
2637     }
2638     push(tmp);  // original SP
2639 
2640   } // Finished loading
2641 
2642   //========================================================
2643   // Start search
2644   //
2645 
2646   movptr(result, str1); // string addr
2647 
2648   if (int_cnt2  < 0) {  // Only for non constant substring
2649     jmpb(SCAN_TO_SUBSTR);
2650 
2651     // SP saved at sp+0
2652     // String saved at sp+1*wordSize
2653     // Substr saved at sp+2*wordSize
2654     // Substr count saved at sp+3*wordSize
2655 
2656     // Reload substr for rescan, this code
2657     // is executed only for large substrings (> 8 chars)
2658     bind(RELOAD_SUBSTR);
2659     movptr(str2, Address(rsp, 2*wordSize));
2660     movl(cnt2, Address(rsp, 3*wordSize));
2661     if (ae == StrIntrinsicNode::UL) {
2662       pmovzxbw(vec, Address(str2, 0));
2663     } else {
2664       movdqu(vec, Address(str2, 0));
2665     }
2666     // We came here after the beginning of the substring was
2667     // matched but the rest of it was not, so we need to search
2668     // again. Start from the next element after the previous match.
2669     subptr(str1, result); // Restore counter
2670     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2671       shrl(str1, 1);
2672     }
2673     addl(cnt1, str1);
2674     decrementl(cnt1);   // Shift to next element
2675     cmpl(cnt1, cnt2);
2676     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2677 
2678     addptr(result, (1<<scale1));
2679   } // non constant
2680 
2681   // Scan string for start of substr in 16-byte vectors
2682   bind(SCAN_TO_SUBSTR);
2683   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2684   pcmpestri(vec, Address(result, 0), mode);
2685   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2686   subl(cnt1, stride);
2687   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2688   cmpl(cnt1, cnt2);
2689   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2690   addptr(result, 16);
2691 
2692   bind(ADJUST_STR);
2693   cmpl(cnt1, stride); // Do not read beyond string
2694   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2695   // Back-up string to avoid reading beyond string.
2696   lea(result, Address(result, cnt1, scale1, -16));
2697   movl(cnt1, stride);
2698   jmpb(SCAN_TO_SUBSTR);
2699 
2700   // Found a potential substr
2701   bind(FOUND_CANDIDATE);
2702   // After pcmpestri tmp(rcx) contains matched element index
2703 
2704   // Make sure string is still long enough
2705   subl(cnt1, tmp);
2706   cmpl(cnt1, cnt2);
2707   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2708   // Left less than substring.
2709 
2710   bind(RET_NOT_FOUND);
2711   movl(result, -1);
2712   jmp(CLEANUP);
2713 
2714   bind(FOUND_SUBSTR);
2715   // Compute start addr of substr
2716   lea(result, Address(result, tmp, scale1));
2717   if (int_cnt2 > 0) { // Constant substring
2718     // Repeat search for small substring (< 8 chars)
2719     // from new point without reloading substring.
2720     // Have to check that we don't read beyond string.
2721     cmpl(tmp, stride-int_cnt2);
2722     jccb(Assembler::greater, ADJUST_STR);
2723     // Fall through if matched whole substring.
2724   } else { // non constant
2725     assert(int_cnt2 == -1, "should be -1");
2726 
2727     addl(tmp, cnt2);
2728     // Found result if we matched whole substring.
2729     cmpl(tmp, stride);
2730     jcc(Assembler::lessEqual, RET_FOUND);
2731 
2732     // Repeat search for small substring (<= 8 chars)
2733     // from new point 'str1' without reloading substring.
2734     cmpl(cnt2, stride);
2735     // Have to check that we don't read beyond string.
2736     jccb(Assembler::lessEqual, ADJUST_STR);
2737 
2738     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2739     // Compare the rest of substring (> 8 chars).
2740     movptr(str1, result);
2741 
2742     cmpl(tmp, cnt2);
2743     // First 8 chars are already matched.
2744     jccb(Assembler::equal, CHECK_NEXT);
2745 
2746     bind(SCAN_SUBSTR);
2747     pcmpestri(vec, Address(str1, 0), mode);
2748     // Need to reload string pointers if we did not match the whole vector
2749     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2750 
2751     bind(CHECK_NEXT);
2752     subl(cnt2, stride);
2753     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2754     addptr(str1, 16);
2755     if (ae == StrIntrinsicNode::UL) {
2756       addptr(str2, 8);
2757     } else {
2758       addptr(str2, 16);
2759     }
2760     subl(cnt1, stride);
2761     cmpl(cnt2, stride); // Do not read beyond substring
2762     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2763     // Back-up strings to avoid reading beyond substring.
2764 
2765     if (ae == StrIntrinsicNode::UL) {
2766       lea(str2, Address(str2, cnt2, scale2, -8));
2767       lea(str1, Address(str1, cnt2, scale1, -16));
2768     } else {
2769       lea(str2, Address(str2, cnt2, scale2, -16));
2770       lea(str1, Address(str1, cnt2, scale1, -16));
2771     }
2772     subl(cnt1, cnt2);
2773     movl(cnt2, stride);
2774     addl(cnt1, stride);
2775     bind(CONT_SCAN_SUBSTR);
2776     if (ae == StrIntrinsicNode::UL) {
2777       pmovzxbw(vec, Address(str2, 0));
2778     } else {
2779       movdqu(vec, Address(str2, 0));
2780     }
2781     jmp(SCAN_SUBSTR);
2782 
2783     bind(RET_FOUND_LONG);
2784     movptr(str1, Address(rsp, wordSize));
2785   } // non constant
2786 
2787   bind(RET_FOUND);
2788   // Compute substr offset
2789   subptr(result, str1);
2790   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2791     shrl(result, 1); // index
2792   }
2793   bind(CLEANUP);
2794   pop(rsp); // restore SP
2795 
2796 } // string_indexof
2797 
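// Find the first occurrence of a char in a UTF-16 char sequence.
// A rough Java-level sketch of what this intrinsic computes (illustrative
// only, not the verbatim JDK source; cf. java.lang.StringUTF16):
//   static int indexOfChar(char[] value, int ch, int max) {
//     for (int i = 0; i < max; i++) {
//       if (value[i] == ch) {
//         return i;
//       }
//     }
//     return -1;
//   }
// The generated code scans 16 chars (32 bytes) per iteration with AVX2,
// then 8 chars with SSE4.2, then finishes with a scalar loop.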
2798 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2799                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2800   ShortBranchVerifier sbv(this);
2801   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2802 
2803   int stride = 8;
2804 
2805   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2806         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2807         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2808         FOUND_SEQ_CHAR, DONE_LABEL;
2809 
2810   movptr(result, str1);
2811   if (UseAVX >= 2) {
2812     cmpl(cnt1, stride);
2813     jcc(Assembler::less, SCAN_TO_CHAR);
2814     cmpl(cnt1, 2*stride);
2815     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2816     movdl(vec1, ch);
2817     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2818     vpxor(vec2, vec2);
2819     movl(tmp, cnt1);
2820     andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
2821     andl(cnt1, 0x0000000F); // tail count (in chars)
2822 
2823     bind(SCAN_TO_16_CHAR_LOOP);
2824     vmovdqu(vec3, Address(result, 0));
2825     vpcmpeqw(vec3, vec3, vec1, 1);
2826     vptest(vec2, vec3);
2827     jcc(Assembler::carryClear, FOUND_CHAR);
2828     addptr(result, 32);
2829     subl(tmp, 2*stride);
2830     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2831     jmp(SCAN_TO_8_CHAR);
2832     bind(SCAN_TO_8_CHAR_INIT);
2833     movdl(vec1, ch);
2834     pshuflw(vec1, vec1, 0x00);
2835     pshufd(vec1, vec1, 0);
2836     pxor(vec2, vec2);
2837   }
2838   bind(SCAN_TO_8_CHAR);
2839   cmpl(cnt1, stride);
2840   jcc(Assembler::less, SCAN_TO_CHAR);
2841   if (UseAVX < 2) {
2842     movdl(vec1, ch);
2843     pshuflw(vec1, vec1, 0x00);
2844     pshufd(vec1, vec1, 0);
2845     pxor(vec2, vec2);
2846   }
2847   movl(tmp, cnt1);
2848   andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
2849   andl(cnt1, 0x00000007); // tail count (in chars)
2850 
2851   bind(SCAN_TO_8_CHAR_LOOP);
2852   movdqu(vec3, Address(result, 0));
2853   pcmpeqw(vec3, vec1);
2854   ptest(vec2, vec3);
2855   jcc(Assembler::carryClear, FOUND_CHAR);
2856   addptr(result, 16);
2857   subl(tmp, stride);
2858   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2859   bind(SCAN_TO_CHAR);
2860   testl(cnt1, cnt1);
2861   jcc(Assembler::zero, RET_NOT_FOUND);
2862   bind(SCAN_TO_CHAR_LOOP);
2863   load_unsigned_short(tmp, Address(result, 0));
2864   cmpl(ch, tmp);
2865   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2866   addptr(result, 2);
2867   subl(cnt1, 1);
2868   jccb(Assembler::zero, RET_NOT_FOUND);
2869   jmp(SCAN_TO_CHAR_LOOP);
2870 
2871   bind(RET_NOT_FOUND);
2872   movl(result, -1);
2873   jmpb(DONE_LABEL);
2874 
2875   bind(FOUND_CHAR);
2876   if (UseAVX >= 2) {
2877     vpmovmskb(tmp, vec3);
2878   } else {
2879     pmovmskb(tmp, vec3);
2880   }
2881   bsfl(ch, tmp);
2882   addptr(result, ch);
2883 
2884   bind(FOUND_SEQ_CHAR);
2885   subptr(result, str1);
2886   shrl(result, 1);
2887 
2888   bind(DONE_LABEL);
2889 } // string_indexof_char
2890 
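// Latin1 (byte) variant of the char search above: one element per byte, so an
// AVX2 vector holds 32 elements and an SSE vector 16. Rough Java-level sketch
// (illustrative only, not the verbatim JDK source):
//   static int indexOfCharLatin1(byte[] value, int ch, int max) {
//     for (int i = 0; i < max; i++) {
//       if ((value[i] & 0xff) == ch) {
//         return i;
//       }
//     }
//     return -1;
//   }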
2891 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2892                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2893   ShortBranchVerifier sbv(this);
2894   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2895 
2896   int stride = 16;
2897 
2898   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2899         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2900         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2901         FOUND_SEQ_CHAR, DONE_LABEL;
2902 
2903   movptr(result, str1);
2904   if (UseAVX >= 2) {
2905     cmpl(cnt1, stride);
2906     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2907     cmpl(cnt1, stride*2);
2908     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2909     movdl(vec1, ch);
2910     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2911     vpxor(vec2, vec2);
2912     movl(tmp, cnt1);
2913     andl(tmp, 0xFFFFFFE0);  // vector count (in chars)
2914     andl(cnt1, 0x0000001F); // tail count (in chars)
2915 
2916     bind(SCAN_TO_32_CHAR_LOOP);
2917     vmovdqu(vec3, Address(result, 0));
2918     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2919     vptest(vec2, vec3);
2920     jcc(Assembler::carryClear, FOUND_CHAR);
2921     addptr(result, 32);
2922     subl(tmp, stride*2);
2923     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2924     jmp(SCAN_TO_16_CHAR);
2925 
2926     bind(SCAN_TO_16_CHAR_INIT);
2927     movdl(vec1, ch);
2928     pxor(vec2, vec2);
2929     pshufb(vec1, vec2);
2930   }
2931 
2932   bind(SCAN_TO_16_CHAR);
2933   cmpl(cnt1, stride);
2934   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2935   if (UseAVX < 2) {
2936     movdl(vec1, ch);
2937     pxor(vec2, vec2);
2938     pshufb(vec1, vec2);
2939   }
2940   movl(tmp, cnt1);
2941   andl(tmp, 0xFFFFFFF0);  // vector count (in bytes)
2942   andl(cnt1, 0x0000000F); // tail count (in bytes)
2943 
2944   bind(SCAN_TO_16_CHAR_LOOP);
2945   movdqu(vec3, Address(result, 0));
2946   pcmpeqb(vec3, vec1);
2947   ptest(vec2, vec3);
2948   jcc(Assembler::carryClear, FOUND_CHAR);
2949   addptr(result, 16);
2950   subl(tmp, stride);
2951   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
2952 
2953   bind(SCAN_TO_CHAR_INIT);
2954   testl(cnt1, cnt1);
2955   jcc(Assembler::zero, RET_NOT_FOUND);
2956   bind(SCAN_TO_CHAR_LOOP);
2957   load_unsigned_byte(tmp, Address(result, 0));
2958   cmpl(ch, tmp);
2959   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2960   addptr(result, 1);
2961   subl(cnt1, 1);
2962   jccb(Assembler::zero, RET_NOT_FOUND);
2963   jmp(SCAN_TO_CHAR_LOOP);
2964 
2965   bind(RET_NOT_FOUND);
2966   movl(result, -1);
2967   jmpb(DONE_LABEL);
2968 
2969   bind(FOUND_CHAR);
2970   if (UseAVX >= 2) {
2971     vpmovmskb(tmp, vec3);
2972   } else {
2973     pmovmskb(tmp, vec3);
2974   }
2975   bsfl(ch, tmp);
2976   addptr(result, ch);
2977 
2978   bind(FOUND_SEQ_CHAR);
2979   subptr(result, str1);
2980 
2981   bind(DONE_LABEL);
2982 } // stringL_indexof_char
2983 
2984 // helper function for string_compare
2985 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
2986                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
2987                                            Address::ScaleFactor scale2, Register index, int ae) {
2988   if (ae == StrIntrinsicNode::LL) {
2989     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
2990     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
2991   } else if (ae == StrIntrinsicNode::UU) {
2992     load_unsigned_short(elem1, Address(str1, index, scale, 0));
2993     load_unsigned_short(elem2, Address(str2, index, scale, 0));
2994   } else {
2995     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
2996     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
2997   }
2998 }
2999 
3000 // Compare strings, used for char[] and byte[].
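// The result follows String.compareTo semantics: the difference of the first
// mismatching elements, or the length difference if one string is a prefix of
// the other. Rough scalar sketch (illustrative only, not the verbatim JDK
// source; for UL the final result is negated at DONE_LABEL because the
// arguments come in swapped):
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];
//     }
//     return len1 - len2;
//   }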
3001 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3002                                        Register cnt1, Register cnt2, Register result,
3003                                        XMMRegister vec1, int ae, KRegister mask) {
3004   ShortBranchVerifier sbv(this);
3005   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3006   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3007   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3008   int stride2x2 = 0x40;
3009   Address::ScaleFactor scale = Address::no_scale;
3010   Address::ScaleFactor scale1 = Address::no_scale;
3011   Address::ScaleFactor scale2 = Address::no_scale;
3012 
3013   if (ae != StrIntrinsicNode::LL) {
3014     stride2x2 = 0x20;
3015   }
3016 
3017   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3018     shrl(cnt2, 1);
3019   }
3020   // Compute the minimum of the string lengths and the
3021   // difference of the string lengths (pushed on the stack),
3022   // using a conditional move for the minimum.
3023   movl(result, cnt1);
3024   subl(cnt1, cnt2);
3025   push(cnt1);
3026   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3027 
3028   // Is the minimum length zero?
3029   testl(cnt2, cnt2);
3030   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3031   if (ae == StrIntrinsicNode::LL) {
3032     // Load first bytes
3033     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3034     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3035   } else if (ae == StrIntrinsicNode::UU) {
3036     // Load first characters
3037     load_unsigned_short(result, Address(str1, 0));
3038     load_unsigned_short(cnt1, Address(str2, 0));
3039   } else {
3040     load_unsigned_byte(result, Address(str1, 0));
3041     load_unsigned_short(cnt1, Address(str2, 0));
3042   }
3043   subl(result, cnt1);
3044   jcc(Assembler::notZero,  POP_LABEL);
3045 
3046   if (ae == StrIntrinsicNode::UU) {
3047     // Divide length by 2 to get number of chars
3048     shrl(cnt2, 1);
3049   }
3050   cmpl(cnt2, 1);
3051   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3052 
3053   // Check if the strings start at the same location and setup scale and stride
3054   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3055     cmpptr(str1, str2);
3056     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3057     if (ae == StrIntrinsicNode::LL) {
3058       scale = Address::times_1;
3059       stride = 16;
3060     } else {
3061       scale = Address::times_2;
3062       stride = 8;
3063     }
3064   } else {
3065     scale1 = Address::times_1;
3066     scale2 = Address::times_2;
3067     // scale not used
3068     stride = 8;
3069   }
3070 
3071   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3072     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3073     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3074     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3075     Label COMPARE_TAIL_LONG;
3076     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3077 
3078     int pcmpmask = 0x19;
3079     if (ae == StrIntrinsicNode::LL) {
3080       pcmpmask &= ~0x01;
3081     }
3082 
3083     // Setup to compare 16-char (32-byte) vectors,
3084     // starting from the first character again because it has an aligned address.
3085     if (ae == StrIntrinsicNode::LL) {
3086       stride2 = 32;
3087     } else {
3088       stride2 = 16;
3089     }
3090     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3091       adr_stride = stride << scale;
3092     } else {
3093       adr_stride1 = 8;  //stride << scale1;
3094       adr_stride2 = 16; //stride << scale2;
3095     }
3096 
3097     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3098     // rax and rdx are used by pcmpestri as element counters
3099     movl(result, cnt2);
3100     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3101     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3102 
3103     // fast path: compare the first two 8-char vectors.
3104     bind(COMPARE_16_CHARS);
3105     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3106       movdqu(vec1, Address(str1, 0));
3107     } else {
3108       pmovzxbw(vec1, Address(str1, 0));
3109     }
3110     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3111     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3112 
3113     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3114       movdqu(vec1, Address(str1, adr_stride));
3115       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3116     } else {
3117       pmovzxbw(vec1, Address(str1, adr_stride1));
3118       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3119     }
3120     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3121     addl(cnt1, stride);
3122 
3123     // Compare the characters at index in cnt1
3124     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3125     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3126     subl(result, cnt2);
3127     jmp(POP_LABEL);
3128 
3129     // Setup the registers to start vector comparison loop
3130     bind(COMPARE_WIDE_VECTORS);
3131     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3132       lea(str1, Address(str1, result, scale));
3133       lea(str2, Address(str2, result, scale));
3134     } else {
3135       lea(str1, Address(str1, result, scale1));
3136       lea(str2, Address(str2, result, scale2));
3137     }
3138     subl(result, stride2);
3139     subl(cnt2, stride2);
3140     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3141     negptr(result);
3142 
3143     // In a loop, compare 16 chars (32 bytes) at once using vpxor+vptest
3144     bind(COMPARE_WIDE_VECTORS_LOOP);
3145 
3146 #ifdef _LP64
3147     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3148       cmpl(cnt2, stride2x2);
3149       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3150       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3151       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3152 
3153       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3154       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3155         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3156         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3157       } else {
3158         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3159         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3160       }
3161       kortestql(mask, mask);
3162       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3163       addptr(result, stride2x2);  // update since we already compared at this addr
3164       subl(cnt2, stride2x2);      // and sub the size too
3165       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3166 
3167       vpxor(vec1, vec1);
3168       jmpb(COMPARE_WIDE_TAIL);
3169     }//if (VM_Version::supports_avx512vlbw())
3170 #endif // _LP64
3171 
3172 
3173     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3174     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3175       vmovdqu(vec1, Address(str1, result, scale));
3176       vpxor(vec1, Address(str2, result, scale));
3177     } else {
3178       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3179       vpxor(vec1, Address(str2, result, scale2));
3180     }
3181     vptest(vec1, vec1);
3182     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3183     addptr(result, stride2);
3184     subl(cnt2, stride2);
3185     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3186     // clean upper bits of YMM registers
3187     vpxor(vec1, vec1);
3188 
3189     // compare wide vectors tail
3190     bind(COMPARE_WIDE_TAIL);
3191     testptr(result, result);
3192     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3193 
3194     movl(result, stride2);
3195     movl(cnt2, result);
3196     negptr(result);
3197     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3198 
3199     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3200     bind(VECTOR_NOT_EQUAL);
3201     // clean upper bits of YMM registers
3202     vpxor(vec1, vec1);
3203     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3204       lea(str1, Address(str1, result, scale));
3205       lea(str2, Address(str2, result, scale));
3206     } else {
3207       lea(str1, Address(str1, result, scale1));
3208       lea(str2, Address(str2, result, scale2));
3209     }
3210     jmp(COMPARE_16_CHARS);
3211 
3212     // Compare tail chars, length between 1 and 15 chars
3213     bind(COMPARE_TAIL_LONG);
3214     movl(cnt2, result);
3215     cmpl(cnt2, stride);
3216     jcc(Assembler::less, COMPARE_SMALL_STR);
3217 
3218     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3219       movdqu(vec1, Address(str1, 0));
3220     } else {
3221       pmovzxbw(vec1, Address(str1, 0));
3222     }
3223     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3224     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3225     subptr(cnt2, stride);
3226     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3227     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3228       lea(str1, Address(str1, result, scale));
3229       lea(str2, Address(str2, result, scale));
3230     } else {
3231       lea(str1, Address(str1, result, scale1));
3232       lea(str2, Address(str2, result, scale2));
3233     }
3234     negptr(cnt2);
3235     jmpb(WHILE_HEAD_LABEL);
3236 
3237     bind(COMPARE_SMALL_STR);
3238   } else if (UseSSE42Intrinsics) {
3239     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3240     int pcmpmask = 0x19;
3241     // Setup to compare 8-char (16-byte) vectors,
3242     // starting from the first character again because it has an aligned address.
3243     movl(result, cnt2);
3244     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3245     if (ae == StrIntrinsicNode::LL) {
3246       pcmpmask &= ~0x01;
3247     }
3248     jcc(Assembler::zero, COMPARE_TAIL);
3249     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3250       lea(str1, Address(str1, result, scale));
3251       lea(str2, Address(str2, result, scale));
3252     } else {
3253       lea(str1, Address(str1, result, scale1));
3254       lea(str2, Address(str2, result, scale2));
3255     }
3256     negptr(result);
3257 
3258     // pcmpestri
3259     //   inputs:
3260     //     vec1- substring
3261     //     rax - negative string length (elements count)
3262     //     mem - scanned string
3263     //     rdx - string length (elements count)
3264     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3265     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3266     //   outputs:
3267     //     rcx - first mismatched element index
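    // For example, pcmpmask 0x19 = 0b11001 decodes as:
    //   bits 1:0 = 01 -> unsigned 16-bit elements (cleared to 00 for LL, i.e. bytes)
    //   bits 3:2 = 10 -> "equal each" aggregation (element-wise compare)
    //   bit  4   = 1  -> negative polarity, so the result marks mismatches
    // which is why rcx ends up with the index of the first mismatched element.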
3268     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3269 
3270     bind(COMPARE_WIDE_VECTORS);
3271     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3272       movdqu(vec1, Address(str1, result, scale));
3273       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3274     } else {
3275       pmovzxbw(vec1, Address(str1, result, scale1));
3276       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3277     }
3278     // After pcmpestri cnt1(rcx) contains mismatched element index
3279 
3280     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3281     addptr(result, stride);
3282     subptr(cnt2, stride);
3283     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3284 
3285     // compare wide vectors tail
3286     testptr(result, result);
3287     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3288 
3289     movl(cnt2, stride);
3290     movl(result, stride);
3291     negptr(result);
3292     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3293       movdqu(vec1, Address(str1, result, scale));
3294       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3295     } else {
3296       pmovzxbw(vec1, Address(str1, result, scale1));
3297       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3298     }
3299     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3300 
3301     // Mismatched characters in the vectors
3302     bind(VECTOR_NOT_EQUAL);
3303     addptr(cnt1, result);
3304     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3305     subl(result, cnt2);
3306     jmpb(POP_LABEL);
3307 
3308     bind(COMPARE_TAIL); // limit is zero
3309     movl(cnt2, result);
3310     // Fallthru to tail compare
3311   }
3312   // Shift str2 and str1 to the end of the arrays, negate min
3313   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3314     lea(str1, Address(str1, cnt2, scale));
3315     lea(str2, Address(str2, cnt2, scale));
3316   } else {
3317     lea(str1, Address(str1, cnt2, scale1));
3318     lea(str2, Address(str2, cnt2, scale2));
3319   }
3320   decrementl(cnt2);  // first character was compared already
3321   negptr(cnt2);
3322 
3323   // Compare the rest of the elements
3324   bind(WHILE_HEAD_LABEL);
3325   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3326   subl(result, cnt1);
3327   jccb(Assembler::notZero, POP_LABEL);
3328   increment(cnt2);
3329   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3330 
3331   // Strings are equal up to min length.  Return the length difference.
3332   bind(LENGTH_DIFF_LABEL);
3333   pop(result);
3334   if (ae == StrIntrinsicNode::UU) {
3335     // Divide diff by 2 to get number of chars
3336     sarl(result, 1);
3337   }
3338   jmpb(DONE_LABEL);
3339 
3340 #ifdef _LP64
3341   if (VM_Version::supports_avx512vlbw()) {
3342 
3343     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3344 
3345     kmovql(cnt1, mask);
3346     notq(cnt1);
3347     bsfq(cnt2, cnt1);
3348     if (ae != StrIntrinsicNode::LL) {
3349       // Divide diff by 2 to get number of chars
3350       sarl(cnt2, 1);
3351     }
3352     addq(result, cnt2);
3353     if (ae == StrIntrinsicNode::LL) {
3354       load_unsigned_byte(cnt1, Address(str2, result));
3355       load_unsigned_byte(result, Address(str1, result));
3356     } else if (ae == StrIntrinsicNode::UU) {
3357       load_unsigned_short(cnt1, Address(str2, result, scale));
3358       load_unsigned_short(result, Address(str1, result, scale));
3359     } else {
3360       load_unsigned_short(cnt1, Address(str2, result, scale2));
3361       load_unsigned_byte(result, Address(str1, result, scale1));
3362     }
3363     subl(result, cnt1);
3364     jmpb(POP_LABEL);
3365   }//if (VM_Version::supports_avx512vlbw())
3366 #endif // _LP64
3367 
3368   // Discard the stored length difference
3369   bind(POP_LABEL);
3370   pop(cnt1);
3371 
3372   // That's it
3373   bind(DONE_LABEL);
3374   if(ae == StrIntrinsicNode::UL) {
3375     negl(result);
3376   }
3377 
3378 }
3379 
3380 // Search for a non-ASCII character (negative byte value) in a byte array;
3381 // return the index of the first such character, otherwise the length
3382 // of the array segment searched.
3383 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3384 //   @IntrinsicCandidate
3385 //   public static int countPositives(byte[] ba, int off, int len) {
3386 //     for (int i = off; i < off + len; i++) {
3387 //       if (ba[i] < 0) {
3388 //         return i - off;
3389 //       }
3390 //     }
3391 //     return len;
3392 //   }
3393 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3394   Register result, Register tmp1,
3395   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3396   // rsi: byte array
3397   // rcx: len
3398   // rax: result
3399   ShortBranchVerifier sbv(this);
3400   assert_different_registers(ary1, len, result, tmp1);
3401   assert_different_registers(vec1, vec2);
3402   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3403 
3404   movl(result, len); // copy
3405   // len == 0
3406   testl(len, len);
3407   jcc(Assembler::zero, DONE);
3408 
3409   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3410     VM_Version::supports_avx512vlbw() &&
3411     VM_Version::supports_bmi2()) {
3412 
3413     Label test_64_loop, test_tail, BREAK_LOOP;
3414     Register tmp3_aliased = len;
3415 
3416     movl(tmp1, len);
3417     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3418 
3419     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3420     andl(len, ~(64 - 1));    // vector count (in chars)
3421     jccb(Assembler::zero, test_tail);
3422 
3423     lea(ary1, Address(ary1, len, Address::times_1));
3424     negptr(len);
3425 
3426     bind(test_64_loop);
3427     // Check whether our 64 elements of size byte contain negatives
3428     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3429     kortestql(mask1, mask1);
3430     jcc(Assembler::notZero, BREAK_LOOP);
3431 
3432     addptr(len, 64);
3433     jccb(Assembler::notZero, test_64_loop);
3434 
3435     bind(test_tail);
3436     // bail out when there is nothing to be done
3437     testl(tmp1, -1);
3438     jcc(Assembler::zero, DONE);
3439 
3440     // ~(~0 << len) applied up to two times (for 32-bit scenario)
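    // e.g. tmp1 = 3:  ~0 << 3 = ...11111000, negated -> ...00000111,
    // i.e. a mask with exactly tmp1 low bits set.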
3441 #ifdef _LP64
3442     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3443     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3444     notq(tmp3_aliased);
3445     kmovql(mask2, tmp3_aliased);
3446 #else
3447     Label k_init;
3448     jmp(k_init);
3449 
3450     // We cannot read 64 bits at once from a general purpose register, thus we
3451     // move the data required to compose 64 1's into the instruction stream.
3452     // We emit a 64-byte-wide series of elements from 0..63 which are later
3453     // used as compare targets with the tail count contained in the tmp1 register.
3454     // The result is a k register with tmp1 consecutive 1's counting from the
3455     // least significant bit.
3456     address tmp = pc();
3457     emit_int64(0x0706050403020100);
3458     emit_int64(0x0F0E0D0C0B0A0908);
3459     emit_int64(0x1716151413121110);
3460     emit_int64(0x1F1E1D1C1B1A1918);
3461     emit_int64(0x2726252423222120);
3462     emit_int64(0x2F2E2D2C2B2A2928);
3463     emit_int64(0x3736353433323130);
3464     emit_int64(0x3F3E3D3C3B3A3938);
3465 
3466     bind(k_init);
3467     lea(len, InternalAddress(tmp));
3468     // create mask to test for negative byte inside a vector
3469     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3470     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3471 
3472 #endif
3473     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3474     ktestq(mask1, mask2);
3475     jcc(Assembler::zero, DONE);
3476 
3477     bind(BREAK_LOOP);
3478     // At least one byte in the last 64 bytes is negative.
3479     // Set up to look at the last 64 bytes as if they were a tail
3480     lea(ary1, Address(ary1, len, Address::times_1));
3481     addptr(result, len);
3482     // Ignore the very last byte: if all others are positive,
3483     // it must be negative, so we can skip right to the 2+1 byte
3484     // end comparison at this point
3485     orl(result, 63);
3486     movl(len, 63);
3487     // Fallthru to tail compare
3488   } else {
3489 
3490     if (UseAVX >= 2 && UseSSE >= 2) {
3491       // With AVX2, use 32-byte vector compare
3492       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3493 
3494       // Compare 32-byte vectors
3495       testl(len, 0xffffffe0);   // vector count (in bytes)
3496       jccb(Assembler::zero, TAIL_START);
3497 
3498       andl(len, 0xffffffe0);
3499       lea(ary1, Address(ary1, len, Address::times_1));
3500       negptr(len);
3501 
3502       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3503       movdl(vec2, tmp1);
3504       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3505 
3506       bind(COMPARE_WIDE_VECTORS);
3507       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3508       vptest(vec1, vec2);
3509       jccb(Assembler::notZero, BREAK_LOOP);
3510       addptr(len, 32);
3511       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3512 
3513       testl(result, 0x0000001f);   // any bytes remaining?
3514       jcc(Assembler::zero, DONE);
3515 
3516       // Quick test using the already prepared vector mask
3517       movl(len, result);
3518       andl(len, 0x0000001f);
3519       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3520       vptest(vec1, vec2);
3521       jcc(Assembler::zero, DONE);
3522       // There are zeros, jump to the tail to determine exactly where
3523       jmpb(TAIL_START);
3524 
3525       bind(BREAK_LOOP);
3526       // At least one byte in the last 32-byte vector is negative.
3527       // Set up to look at the last 32 bytes as if they were a tail
3528       lea(ary1, Address(ary1, len, Address::times_1));
3529       addptr(result, len);
3530       // Ignore the very last byte: if all others are positive,
3531       // it must be negative, so we can skip right to the 2+1 byte
3532       // end comparison at this point
3533       orl(result, 31);
3534       movl(len, 31);
3535       // Fallthru to tail compare
3536     } else if (UseSSE42Intrinsics) {
3537       // With SSE4.2, use double quad vector compare
3538       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3539 
3540       // Compare 16-byte vectors
3541       testl(len, 0xfffffff0);   // vector count (in bytes)
3542       jcc(Assembler::zero, TAIL_START);
3543 
3544       andl(len, 0xfffffff0);
3545       lea(ary1, Address(ary1, len, Address::times_1));
3546       negptr(len);
3547 
3548       movl(tmp1, 0x80808080);
3549       movdl(vec2, tmp1);
3550       pshufd(vec2, vec2, 0);
3551 
3552       bind(COMPARE_WIDE_VECTORS);
3553       movdqu(vec1, Address(ary1, len, Address::times_1));
3554       ptest(vec1, vec2);
3555       jccb(Assembler::notZero, BREAK_LOOP);
3556       addptr(len, 16);
3557       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3558 
3559       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3560       jcc(Assembler::zero, DONE);
3561 
3562       // Quick test using the already prepared vector mask
3563       movl(len, result);
3564       andl(len, 0x0000000f);   // tail count (in bytes)
3565       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3566       ptest(vec1, vec2);
3567       jcc(Assembler::zero, DONE);
3568       jmpb(TAIL_START);
3569 
3570       bind(BREAK_LOOP);
3571       // At least one byte in the last 16-byte vector is negative.
3572       // Set up and look at the last 16 bytes as if they were a tail
3573       lea(ary1, Address(ary1, len, Address::times_1));
3574       addptr(result, len);
3575       // Ignore the very last byte: if all others are positive,
3576       // it must be negative, so we can skip right to the 2+1 byte
3577       // end comparison at this point
3578       orl(result, 15);
3579       movl(len, 15);
3580       // Fallthru to tail compare
3581     }
3582   }
3583 
3584   bind(TAIL_START);
3585   // Compare 4-byte vectors
3586   andl(len, 0xfffffffc); // vector count (in bytes)
3587   jccb(Assembler::zero, COMPARE_CHAR);
3588 
3589   lea(ary1, Address(ary1, len, Address::times_1));
3590   negptr(len);
3591 
3592   bind(COMPARE_VECTORS);
3593   movl(tmp1, Address(ary1, len, Address::times_1));
3594   andl(tmp1, 0x80808080);
3595   jccb(Assembler::notZero, TAIL_ADJUST);
3596   addptr(len, 4);
3597   jccb(Assembler::notZero, COMPARE_VECTORS);
3598 
3599   // Compare trailing char (final 2-3 bytes), if any
3600   bind(COMPARE_CHAR);
3601 
3602   testl(result, 0x2);   // tail  char
3603   jccb(Assembler::zero, COMPARE_BYTE);
3604   load_unsigned_short(tmp1, Address(ary1, 0));
3605   andl(tmp1, 0x00008080);
3606   jccb(Assembler::notZero, CHAR_ADJUST);
3607   lea(ary1, Address(ary1, 2));
3608 
3609   bind(COMPARE_BYTE);
3610   testl(result, 0x1);   // tail  byte
3611   jccb(Assembler::zero, DONE);
3612   load_unsigned_byte(tmp1, Address(ary1, 0));
3613   testl(tmp1, 0x00000080);
3614   jccb(Assembler::zero, DONE);
3615   subptr(result, 1);
3616   jmpb(DONE);
3617 
3618   bind(TAIL_ADJUST);
3619   // there are negative bits in the last 4 byte block.
3620   // Adjust result and check the next three bytes
3621   addptr(result, len);
3622   orl(result, 3);
3623   lea(ary1, Address(ary1, len, Address::times_1));
3624   jmpb(COMPARE_CHAR);
3625 
3626   bind(CHAR_ADJUST);
3627   // We are looking at a char + optional byte tail, and found that one
3628   // of the bytes in the char is negative. Adjust the result, check the
3629   // first byte and readjust if needed.
3630   andl(result, 0xfffffffc);
3631   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
3632   jccb(Assembler::notZero, DONE);
3633   addptr(result, 1);
3634 
3635   // That's it
3636   bind(DONE);
3637   if (UseAVX >= 2 && UseSSE >= 2) {
3638     // clean upper bits of YMM registers
3639     vpxor(vec1, vec1);
3640     vpxor(vec2, vec2);
3641   }
3642 }
3643 
3644 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
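// Result is 1 (arrays equal) or 0 (not equal). Rough Java-level sketch of the
// is_array_equ case (illustrative only, not the verbatim JDK source):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }
// When is_array_equ is false, ary1/ary2 already point at the data to compare
// and limit holds its length (substring comparison).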
3645 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3646                                       Register limit, Register result, Register chr,
3647                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3648   ShortBranchVerifier sbv(this);
3649   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3650 
3651   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3652   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3653 
3654   if (is_array_equ) {
3655     // Check the input args
3656     cmpoop(ary1, ary2);
3657     jcc(Assembler::equal, TRUE_LABEL);
3658 
3659     // Need additional checks for arrays_equals.
3660     testptr(ary1, ary1);
3661     jcc(Assembler::zero, FALSE_LABEL);
3662     testptr(ary2, ary2);
3663     jcc(Assembler::zero, FALSE_LABEL);
3664 
3665     // Check the lengths
3666     movl(limit, Address(ary1, length_offset));
3667     cmpl(limit, Address(ary2, length_offset));
3668     jcc(Assembler::notEqual, FALSE_LABEL);
3669   }
3670 
3671   // count == 0
3672   testl(limit, limit);
3673   jcc(Assembler::zero, TRUE_LABEL);
3674 
3675   if (is_array_equ) {
3676     // Load array address
3677     lea(ary1, Address(ary1, base_offset));
3678     lea(ary2, Address(ary2, base_offset));
3679   }
3680 
3681   if (is_array_equ && is_char) {
3682     // arrays_equals when used for char[].
3683     shll(limit, 1);      // convert char count to byte count; still != 0
3684   }
3685   movl(result, limit); // copy
3686 
3687   if (UseAVX >= 2) {
3688     // With AVX2, use 32-byte vector compare
3689     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3690 
3691     // Compare 32-byte vectors
3692     andl(result, 0x0000001f);  //   tail count (in bytes)
3693     andl(limit, 0xffffffe0);   // vector count (in bytes)
3694     jcc(Assembler::zero, COMPARE_TAIL);
3695 
3696     lea(ary1, Address(ary1, limit, Address::times_1));
3697     lea(ary2, Address(ary2, limit, Address::times_1));
3698     negptr(limit);
3699 
3700 #ifdef _LP64
3701     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3702       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3703 
3704       cmpl(limit, -64);
3705       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3706 
3707       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3708 
3709       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3710       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3711       kortestql(mask, mask);
3712       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3713       addptr(limit, 64);  // update since we already compared at this addr
3714       cmpl(limit, -64);
3715       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3716 
3717       // At this point we may still need to compare -limit+result bytes.
3718       // We could execute the next two instructions and just continue via the non-wide path:
3719       //  cmpl(limit, 0);
3720       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3721       // But since we stopped at the points ary{1,2}+limit which are
3722       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3723       // (|limit| <= 32 and result < 32),
3724       // we may just compare the last 64 bytes.
3725       //
3726       addptr(result, -64);   // it is safe, bc we just came from this area
3727       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3728       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3729       kortestql(mask, mask);
3730       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3731 
3732       jmp(TRUE_LABEL);
3733 
3734       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3735 
3736     }//if (VM_Version::supports_avx512vlbw())
3737 #endif //_LP64
3738     bind(COMPARE_WIDE_VECTORS);
3739     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3740     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3741     vpxor(vec1, vec2);
3742 
3743     vptest(vec1, vec1);
3744     jcc(Assembler::notZero, FALSE_LABEL);
3745     addptr(limit, 32);
3746     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3747 
3748     testl(result, result);
3749     jcc(Assembler::zero, TRUE_LABEL);
3750 
3751     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3752     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3753     vpxor(vec1, vec2);
3754 
3755     vptest(vec1, vec1);
3756     jccb(Assembler::notZero, FALSE_LABEL);
3757     jmpb(TRUE_LABEL);
3758 
3759     bind(COMPARE_TAIL); // limit is zero
3760     movl(limit, result);
3761     // Fallthru to tail compare
3762   } else if (UseSSE42Intrinsics) {
3763     // With SSE4.2, use double quad vector compare
3764     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3765 
3766     // Compare 16-byte vectors
3767     andl(result, 0x0000000f);  //   tail count (in bytes)
3768     andl(limit, 0xfffffff0);   // vector count (in bytes)
3769     jcc(Assembler::zero, COMPARE_TAIL);
3770 
3771     lea(ary1, Address(ary1, limit, Address::times_1));
3772     lea(ary2, Address(ary2, limit, Address::times_1));
3773     negptr(limit);
3774 
3775     bind(COMPARE_WIDE_VECTORS);
3776     movdqu(vec1, Address(ary1, limit, Address::times_1));
3777     movdqu(vec2, Address(ary2, limit, Address::times_1));
3778     pxor(vec1, vec2);
3779 
3780     ptest(vec1, vec1);
3781     jcc(Assembler::notZero, FALSE_LABEL);
3782     addptr(limit, 16);
3783     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3784 
3785     testl(result, result);
3786     jcc(Assembler::zero, TRUE_LABEL);
3787 
3788     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3789     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3790     pxor(vec1, vec2);
3791 
3792     ptest(vec1, vec1);
3793     jccb(Assembler::notZero, FALSE_LABEL);
3794     jmpb(TRUE_LABEL);
3795 
3796     bind(COMPARE_TAIL); // limit is zero
3797     movl(limit, result);
3798     // Fallthru to tail compare
3799   }
3800 
3801   // Compare 4-byte vectors
3802   andl(limit, 0xfffffffc); // vector count (in bytes)
3803   jccb(Assembler::zero, COMPARE_CHAR);
3804 
3805   lea(ary1, Address(ary1, limit, Address::times_1));
3806   lea(ary2, Address(ary2, limit, Address::times_1));
3807   negptr(limit);
3808 
3809   bind(COMPARE_VECTORS);
3810   movl(chr, Address(ary1, limit, Address::times_1));
3811   cmpl(chr, Address(ary2, limit, Address::times_1));
3812   jccb(Assembler::notEqual, FALSE_LABEL);
3813   addptr(limit, 4);
3814   jcc(Assembler::notZero, COMPARE_VECTORS);
3815 
3816   // Compare trailing char (final 2 bytes), if any
3817   bind(COMPARE_CHAR);
3818   testl(result, 0x2);   // tail  char
3819   jccb(Assembler::zero, COMPARE_BYTE);
3820   load_unsigned_short(chr, Address(ary1, 0));
3821   load_unsigned_short(limit, Address(ary2, 0));
3822   cmpl(chr, limit);
3823   jccb(Assembler::notEqual, FALSE_LABEL);
3824 
3825   if (is_array_equ && is_char) {
3826     bind(COMPARE_BYTE);
3827   } else {
3828     lea(ary1, Address(ary1, 2));
3829     lea(ary2, Address(ary2, 2));
3830 
3831     bind(COMPARE_BYTE);
3832     testl(result, 0x1);   // tail  byte
3833     jccb(Assembler::zero, TRUE_LABEL);
3834     load_unsigned_byte(chr, Address(ary1, 0));
3835     load_unsigned_byte(limit, Address(ary2, 0));
3836     cmpl(chr, limit);
3837     jccb(Assembler::notEqual, FALSE_LABEL);
3838   }
3839   bind(TRUE_LABEL);
3840   movl(result, 1);   // return true
3841   jmpb(DONE);
3842 
3843   bind(FALSE_LABEL);
3844   xorl(result, result); // return false
3845 
3846   // That's it
3847   bind(DONE);
3848   if (UseAVX >= 2) {
3849     // clean upper bits of YMM registers
3850     vpxor(vec1, vec1);
3851     vpxor(vec2, vec2);
3852   }
3853 }
3854 
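// Dispatch a C2 ideal shift/rotate opcode with an immediate count to the
// matching AVX-512 masked instruction. Illustrative use (the register and
// opcode choices below are hypothetical, not from a real call site):
//   // dst{k2} = src1 << 3 per 32-bit lane, zeroing lanes where k2 is 0:
//   evmasked_op(Op_LShiftVI, T_INT, k2, dst, src1, 3, /*merge*/false, Assembler::AVX_512bit);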
3855 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3856                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
3857   switch(ideal_opc) {
3858     case Op_LShiftVS:
3859       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
3860     case Op_LShiftVI:
3861       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
3862     case Op_LShiftVL:
3863       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
3864     case Op_RShiftVS:
3865       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
3866     case Op_RShiftVI:
3867       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
3868     case Op_RShiftVL:
3869       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
3870     case Op_URShiftVS:
3871       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
3872     case Op_URShiftVI:
3873       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
3874     case Op_URShiftVL:
3875       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
3876     case Op_RotateRightV:
3877       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3878     case Op_RotateLeftV:
3879       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3880     default:
3881       fatal("Unsupported masked operation"); break;
3882   }
3883 }
3884 
3885 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3886                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
3887                                     bool is_varshift) {
3888   switch (ideal_opc) {
3889     case Op_AddVB:
3890       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3891     case Op_AddVS:
3892       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3893     case Op_AddVI:
3894       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3895     case Op_AddVL:
3896       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3897     case Op_AddVF:
3898       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3899     case Op_AddVD:
3900       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3901     case Op_SubVB:
3902       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3903     case Op_SubVS:
3904       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
3905     case Op_SubVI:
3906       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
3907     case Op_SubVL:
3908       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
3909     case Op_SubVF:
3910       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
3911     case Op_SubVD:
3912       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
3913     case Op_MulVS:
3914       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
3915     case Op_MulVI:
3916       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
3917     case Op_MulVL:
3918       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
3919     case Op_MulVF:
3920       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
3921     case Op_MulVD:
3922       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
3923     case Op_DivVF:
3924       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
3925     case Op_DivVD:
3926       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
3927     case Op_SqrtVF:
3928       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
3929     case Op_SqrtVD:
3930       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
3931     case Op_AbsVB:
3932       evpabsb(dst, mask, src2, merge, vlen_enc); break;
3933     case Op_AbsVS:
3934       evpabsw(dst, mask, src2, merge, vlen_enc); break;
3935     case Op_AbsVI:
3936       evpabsd(dst, mask, src2, merge, vlen_enc); break;
3937     case Op_AbsVL:
3938       evpabsq(dst, mask, src2, merge, vlen_enc); break;
3939     case Op_FmaVF:
3940       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
3941     case Op_FmaVD:
3942       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
3943     case Op_VectorRearrange:
3944       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
3945     case Op_LShiftVS:
3946       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3947     case Op_LShiftVI:
3948       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3949     case Op_LShiftVL:
3950       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3951     case Op_RShiftVS:
3952       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3953     case Op_RShiftVI:
3954       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3955     case Op_RShiftVL:
3956       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3957     case Op_URShiftVS:
3958       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3959     case Op_URShiftVI:
3960       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3961     case Op_URShiftVL:
3962       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3963     case Op_RotateLeftV:
3964       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3965     case Op_RotateRightV:
3966       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3967     case Op_MaxV:
3968       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3969     case Op_MinV:
3970       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3971     case Op_XorV:
3972       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3973     case Op_OrV:
3974       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3975     case Op_AndV:
3976       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3977     default:
3978       fatal("Unsupported masked operation"); break;
3979   }
3980 }
3981 
3982 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3983                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
3984   switch (ideal_opc) {
3985     case Op_AddVB:
3986       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3987     case Op_AddVS:
3988       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3989     case Op_AddVI:
3990       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3991     case Op_AddVL:
3992       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3993     case Op_AddVF:
3994       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3995     case Op_AddVD:
3996       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3997     case Op_SubVB:
3998       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3999     case Op_SubVS:
4000       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4001     case Op_SubVI:
4002       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4003     case Op_SubVL:
4004       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4005     case Op_SubVF:
4006       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4007     case Op_SubVD:
4008       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4009     case Op_MulVS:
4010       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4011     case Op_MulVI:
4012       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4013     case Op_MulVL:
4014       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4015     case Op_MulVF:
4016       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4017     case Op_MulVD:
4018       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4019     case Op_DivVF:
4020       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4021     case Op_DivVD:
4022       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4023     case Op_FmaVF:
4024       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4025     case Op_FmaVD:
4026       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4027     case Op_MaxV:
4028       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4029     case Op_MinV:
4030       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4031     case Op_XorV:
4032       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4033     case Op_OrV:
4034       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4035     case Op_AndV:
4036       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4037     default:
4038       fatal("Unsupported masked operation"); break;
4039   }
4040 }
4041 
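// Combine two opmask registers with a logical op. The mask length (in lanes)
// selects the k-instruction width via etype, e.g. mask_len = 16 maps to
// T_SHORT and thus the word-wide forms (kandw/korw/kxorw).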
4042 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4043                                   KRegister src1, KRegister src2) {
4044   BasicType etype = T_ILLEGAL;
4045   switch(mask_len) {
4046     case 2:
4047     case 4:
4048     case 8:  etype = T_BYTE; break;
4049     case 16: etype = T_SHORT; break;
4050     case 32: etype = T_INT; break;
4051     case 64: etype = T_LONG; break;
4052     default: fatal("Unsupported type"); break;
4053   }
4054   assert(etype != T_ILLEGAL, "");
4055   switch(ideal_opc) {
4056     case Op_AndVMask:
4057       kand(etype, dst, src1, src2); break;
4058     case Op_OrVMask:
4059       kor(etype, dst, src1, src2); break;
4060     case Op_XorVMask:
4061       kxor(etype, dst, src1, src2); break;
4062     default:
4063       fatal("Unsupported masked operation"); break;
4064   }
4065 }
4066 
4067 /*
4068  * Algorithm for vector D2L and F2I conversions:
4069  * a) Perform the vector D2L/F2I cast.
4070  * b) Take the fast path if no lane of the result vector contains the value
4071  *    0x80000000, which signifies that the source value could have been one of
4072  *    the special floating point values (NaN, -Inf, Inf, Max, -Min).
4073  * c) Set the destination to zero where the source is NaN.
4074  * d) Replace 0x80000000 with MaxInt where the source lane holds a +ve value.
4075  */
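/*
 * A scalar sketch of the Java cast semantics that steps (c) and (d) restore,
 * shown for F2I (illustrative only; D2L is analogous with long/double):
 *   (int) f  ==  (f != f)                         ? 0                  // NaN
 *            :   (f >= (float) Integer.MAX_VALUE) ? Integer.MAX_VALUE
 *            :   (f <= Integer.MIN_VALUE)         ? Integer.MIN_VALUE
 *            :   (int) f;                         // truncate toward zero
 * The raw hardware conversion yields 0x80000000 (Integer.MIN_VALUE) for all of
 * these special cases, so only the NaN lanes and positive lanes need fixing.
 */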
4076 
4077 void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4078                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4079                                             Register scratch, int vec_enc) {
4080   Label done;
4081   evcvttpd2qq(dst, src, vec_enc);
4082   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch);
4083   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4084   kortestwl(ktmp1, ktmp1);
4085   jccb(Assembler::equal, done);
4086 
4087   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
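       // ktmp2 := lanes where src is NaN (src compares unordered with itself);
       // zero those lanes in dst.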
4088   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4089   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4090 
4091   kxorwl(ktmp1, ktmp1, ktmp2);
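       // ktmp1 now holds the remaining (non-NaN) special lanes; of those, select
       // the lanes whose source is not less than zero and overwrite them with
       // MaxLong (~sign_flip, materialized via ternary logic function 0x11).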
4092   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4093   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4094   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4095   bind(done);
4096 }
4097 
4098 void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4099                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4100                                            AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
4101   Label done;
4102   vcvttps2dq(dst, src, vec_enc);
4103   vmovdqu(xtmp1, float_sign_flip, scratch, vec_enc);
4104   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4105   vptest(xtmp2, xtmp2, vec_enc);
4106   jccb(Assembler::equal, done);
4107 
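       // Materialize MaxInt in every lane of xtmp1: all-ones XOR sign_flip
       // yields ~0x80000000 == 0x7FFFFFFF.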
4108   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4109   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4110 
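       // Zero out NaN lanes: blend dst with zero wherever src is unordered
       // with itself.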
4111   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4112   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4113   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4114 
4115   // Recompute the mask for the remaining special values.
4116   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4117   // Extract SRC values corresponding to TRUE mask lanes.
4118   vpand(xtmp4, xtmp2, src, vec_enc);
4119   // Flip the mask bits so that the MSB of mask lanes corresponding to positive
4120   // special values is set.
4121   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4122 
4123   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4124   bind(done);
4125 }
4126 
4127 void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4128                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4129                                             Register scratch, int vec_enc) {
4130   Label done;
4131   vcvttps2dq(dst, src, vec_enc);
4132   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, scratch);
4133   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4134   kortestwl(ktmp1, ktmp1);
4135   jccb(Assembler::equal, done);
4136 
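       // ktmp2 := NaN lanes (src unordered with itself); zero those lanes in dst.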
4137   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4138   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4139   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4140 
4141   kxorwl(ktmp1, ktmp1, ktmp2);
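       // Of the remaining special lanes select those whose source is not less
       // than zero and overwrite them with MaxInt (~float_sign_flip, via
       // ternary logic function 0x11).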
4142   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4143   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4144   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4145   bind(done);
4146 }
4147 
4148 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4149                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4150   switch (from_elem_bt) {
4151     case T_BYTE:
4152       switch (to_elem_bt) {
4153         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4154         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4155         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4156         default: ShouldNotReachHere();
4157       }
4158       break;
4159     case T_SHORT:
4160       switch (to_elem_bt) {
4161         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4162         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4163         default: ShouldNotReachHere();
4164       }
4165       break;
4166     case T_INT:
4167       assert(to_elem_bt == T_LONG, "");
4168       vpmovzxdq(dst, src, vlen_enc);
4169       break;
4170     default:
4171       ShouldNotReachHere();
4172   }
4173 }
4174 
4175 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4176                                    bool merge, BasicType bt, int vlen_enc) {
4177   if (bt == T_INT) {
4178     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4179   } else {
4180     assert(bt == T_LONG, "");
4181     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4182   }
4183 }
4184 
4185 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4186                                    bool merge, BasicType bt, int vlen_enc) {
4187   if (bt == T_INT) {
4188     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4189   } else {
4190     assert(bt == T_LONG, "");
4191     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4192   }
4193 }
4194 
4195 #ifdef _LP64
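
     // Convert the low mask_len bits of src into a vector of 0x00/0x01 bytes,
     // one byte per mask bit. PDEP with the constant 0x0101010101010101 deposits
     // eight mask bits at a time into the least significant bit of each byte.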
4196 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4197                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4198                                                int vec_enc) {
4199   int index = 0;
4200   int vindex = 0;
4201   mov64(rtmp1, 0x0101010101010101L);
4202   pdep(rtmp1, src, rtmp1);
4203   if (mask_len > 8) {
4204     movq(rtmp2, src);
4205     vpxor(xtmp, xtmp, xtmp, vec_enc);
4206     movq(xtmp, rtmp1);
4207   }
4208   movq(dst, rtmp1);
4209 
4210   mask_len -= 8;
4211   while (mask_len > 0) {
4212     assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
4213     index++;
4214     if ((index % 2) == 0) {
4215       pxor(xtmp, xtmp);
4216     }
4217     mov64(rtmp1, 0x0101010101010101L);
4218     shrq(rtmp2, 8);
4219     pdep(rtmp1, rtmp2, rtmp1);
4220     pinsrq(xtmp, rtmp1, index % 2);
4221     vindex = index / 2;
4222     if (vindex) {
4223       // Write the entire 16 byte vector only when both 64 bit
4224       // lanes have been updated, to avoid redundant instructions.
4225       if (index % 2) {
4226         vinsertf128(dst, dst, xtmp, vindex);
4227       }
4228     } else {
4229       vmovdqu(dst, xtmp);
4230     }
4231     mask_len -= 8;
4232   }
4233 }
4234 
4235 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
4236   switch (opc) {
4237     case Op_VectorMaskTrueCount:
4238       popcntq(dst, tmp);
4239       break;
4240     case Op_VectorMaskLastTrue:
4241       if (VM_Version::supports_lzcnt()) {
4242         lzcntq(tmp, tmp);
4243         movl(dst, 63);
4244         subl(dst, tmp);
4245       } else {
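             // BSR sets ZF when its source is zero; keep the -1 in dst in that
             // case.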
4246         movl(dst, -1);
4247         bsrq(tmp, tmp);
4248         cmov32(Assembler::notZero, dst, tmp);
4249       }
4250       break;
4251     case Op_VectorMaskFirstTrue:
4252       if (VM_Version::supports_bmi1()) {
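             // Set a sentinel bit at position masklen so that tzcnt returns
             // masklen when no mask bit is set; for masklen of 32/64, tzcnt of
             // an all-zero mask already yields the operand size.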
4253         if (masklen < 32) {
4254           orl(tmp, 1 << masklen);
4255           tzcntl(dst, tmp);
4256         } else if (masklen == 32) {
4257           tzcntl(dst, tmp);
4258         } else {
4259           assert(masklen == 64, "");
4260           tzcntq(dst, tmp);
4261         }
4262       } else {
4263         if (masklen < 32) {
4264           orl(tmp, 1 << masklen);
4265           bsfl(dst, tmp);
4266         } else {
4267           assert(masklen == 32 || masklen == 64, "");
4268           movl(dst, masklen);
4269           if (masklen == 32)  {
4270             bsfl(tmp, tmp);
4271           } else {
4272             bsfq(tmp, tmp);
4273           }
4274           cmov32(Assembler::notZero, dst, tmp);
4275         }
4276       }
4277       break;
4278     case Op_VectorMaskToLong:
4279       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
4280       break;
4281     default: assert(false, "Unhandled mask operation");
4282   }
4283 }
4284 
4285 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
4286                                               int masklen, int masksize, int vec_enc) {
4287   assert(VM_Version::supports_popcnt(), "");
4288 
4289   if (VM_Version::supports_avx512bw()) {
4290     kmovql(tmp, mask);
4291   } else {
4292     assert(masklen <= 16, "");
4293     kmovwl(tmp, mask);
4294   }
4295 
4296   // A mask generated by partial vector comparison, replicate or mask
4297   // manipulation operations needs to be clipped.
4298   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
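         // e.g. for masklen == 8 this keeps only the low 8 bits: tmp &= 0xFF.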
4299     andq(tmp, (1 << masklen) - 1);
4300   }
4301 
4302   vector_mask_operation_helper(opc, dst, tmp, masklen);
4303 }
4304 
4305 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4306                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
4307   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
4308          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
4309   assert(VM_Version::supports_popcnt(), "");
4310 
4311   bool need_clip = false;
4312   switch (bt) {
4313     case T_BOOLEAN:
4314       // While masks of other types contain lane values of 0 or -1, boolean masks contain 0 or 1.
4315       vpxor(xtmp, xtmp, xtmp, vec_enc);
4316       vpsubb(xtmp, xtmp, mask, vec_enc);
4317       vpmovmskb(tmp, xtmp, vec_enc);
4318       need_clip = masklen < 16;
4319       break;
4320     case T_BYTE:
4321       vpmovmskb(tmp, mask, vec_enc);
4322       need_clip = masklen < 16;
4323       break;
4324     case T_SHORT:
4325       vpacksswb(xtmp, mask, mask, vec_enc);
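           // Packing is 128 bit lane-wise; vpermpd with selector 8 (0b1000) moves
           // quadwords 0 and 2 into the low 128 bits to undo the interleaving.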
4326       if (masklen >= 16) {
4327         vpermpd(xtmp, xtmp, 8, vec_enc);
4328       }
4329       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
4330       need_clip = masklen < 16;
4331       break;
4332     case T_INT:
4333     case T_FLOAT:
4334       vmovmskps(tmp, mask, vec_enc);
4335       need_clip = masklen < 4;
4336       break;
4337     case T_LONG:
4338     case T_DOUBLE:
4339       vmovmskpd(tmp, mask, vec_enc);
4340       need_clip = masklen < 2;
4341       break;
4342     default: assert(false, "Unhandled type, %s", type2name(bt));
4343   }
4344 
4345   // A mask generated by partial vector comparison, replicate or mask
4346   // manipulation operations needs to be clipped.
4347   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4348     // need_clip implies masklen < 32
4349     andq(tmp, (1 << masklen) - 1);
4350   }
4351 
4352   vector_mask_operation_helper(opc, dst, tmp, masklen);
4353 }
4354 #endif // _LP64
4355 
4356 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
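       // src contains 0 or -1 (all bits set); move it into the mask register
       // and shift right so that exactly mask_len mask bits remain.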
4357   if (VM_Version::supports_avx512bw()) {
4358     if (mask_len > 32) {
4359       kmovql(dst, src);
4360     } else {
4361       kmovdl(dst, src);
4362       if (mask_len != 32) {
4363         kshiftrdl(dst, dst, 32 - mask_len);
4364       }
4365     }
4366   } else {
4367     assert(mask_len <= 16, "");
4368     kmovwl(dst, src);
4369     if (mask_len != 16) {
4370       kshiftrwl(dst, dst, 16 - mask_len);
4371     }
4372   }
4373 }
4374 
4375 
4376 //
4377 // The following is the lookup table based popcount computation algorithm:
4378 //       Index   Bit set count
4379 //     [ 0000 ->   0,
4380 //       0001 ->   1,
4381 //       0010 ->   1,
4382 //       0011 ->   2,
4383 //       0100 ->   1,
4384 //       0101 ->   2,
4385 //       0110 ->   2,
4386 //       0111 ->   3,
4387 //       1000 ->   1,
4388 //       1001 ->   2,
4389 //       1010 ->   2,
4390 //       1011 ->   3,
4391 //       1100 ->   2,
4392 //       1101 ->   3,
//       1110 ->   3,
4393 //       1111 ->   4 ]
4394 //  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used
4395 //     as shuffle indices for lookup table access.
4396 //  b. Right shift each byte of the vector lane by 4 positions.
4397 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used
4398 //     as shuffle indices for lookup table access.
4399 //  d. Add the bit-set counts of the upper and lower 4 bits of each byte.
4400 //  e. Unpack double words to quad words and compute the sum of absolute differences
4401 //     of the bit-set counts of all the bytes of a quadword.
4402 //  f. Perform step e. for the upper 128 bit vector lane.
4403 //  g. Pack the bit-set counts of the quadwords back to double words.
4404 //  h. Unpacking and packing operations are not needed for 64 bit vector lanes.
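     //
     // A scalar sketch of steps a. to d. on one 32 bit lane (illustrative only;
     // the helper name is hypothetical):
     //
     //   uint32_t popcount32(uint32_t v) {
     //     static const uint8_t lut[16] =
     //         { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
     //     uint32_t sum = 0;
     //     for (int i = 0; i < 8; i++) {  // two nibble lookups per byte
     //       sum += lut[v & 0xF];         // a./c. table lookup on a 4 bit index
     //       v >>= 4;                     // b. shift the next nibble into place
     //     }
     //     return sum;                    // d. to g. use vpaddb/vpsadbw instead
     //   }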
4405 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4406                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
4407                                             int vec_enc) {
4408   if (VM_Version::supports_avx512_vpopcntdq()) {
4409     vpopcntd(dst, src, vec_enc);
4410   } else {
4411     assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4412     movl(rtmp, 0x0F0F0F0F);
4413     movdl(xtmp1, rtmp);
4414     vpbroadcastd(xtmp1, xtmp1, vec_enc);
4415     if (Assembler::AVX_512bit == vec_enc) {
4416       evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), false, vec_enc, rtmp);
4417     } else {
4418       vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp);
4419     }
4420     vpand(xtmp3, src, xtmp1, vec_enc);
4421     vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
4422     vpsrlw(dst, src, 4, vec_enc);
4423     vpand(dst, dst, xtmp1, vec_enc);
4424     vpshufb(dst, xtmp2, dst, vec_enc);
4425     vpaddb(xtmp3, dst, xtmp3, vec_enc);
4426     vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
4427     vpunpckhdq(dst, xtmp3, xtmp1, vec_enc);
4428     vpsadbw(dst, dst, xtmp1, vec_enc);
4429     vpunpckldq(xtmp2, xtmp3, xtmp1, vec_enc);
4430     vpsadbw(xtmp2, xtmp2, xtmp1, vec_enc);
4431     vpackuswb(dst, xtmp2, dst, vec_enc);
4432   }
4433 }
4434 
4435 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4436                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
4437                                              int vec_enc) {
4438   if (VM_Version::supports_avx512_vpopcntdq()) {
4439     vpopcntq(dst, src, vec_enc);
4440   } else if (vec_enc == Assembler::AVX_512bit) {
4441     assert(VM_Version::supports_avx512bw(), "");
4442     movl(rtmp, 0x0F0F0F0F);
4443     movdl(xtmp1, rtmp);
4444     vpbroadcastd(xtmp1, xtmp1, vec_enc);
4445     evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), true, vec_enc, rtmp);
4446     vpandq(xtmp3, src, xtmp1, vec_enc);
4447     vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
4448     vpsrlw(dst, src, 4, vec_enc);
4449     vpandq(dst, dst, xtmp1, vec_enc);
4450     vpshufb(dst, xtmp2, dst, vec_enc);
4451     vpaddb(xtmp3, dst, xtmp3, vec_enc);
4452     vpxorq(xtmp1, xtmp1, xtmp1, vec_enc);
4453     vpsadbw(dst, xtmp3, xtmp1, vec_enc);
4454   } else {
4455     // We do not see any performance benefit from running the
4456     // above instruction sequence on a 256 bit vector, which
4457     // can operate on at most 4 long elements.
4458     ShouldNotReachHere();
4459   }
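       // Long.bitCount() produces an int result, so narrow the per-quadword
       // counts to doublewords.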
4460   evpmovqd(dst, dst, vec_enc);
4461 }
4462 
4463 #ifndef _LP64
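     // On 32 bit targets the 64 bit maskAll pattern is assembled by
     // concatenating two copies of the 32 bit value with kunpckdql.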
4464 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
4465   assert(VM_Version::supports_avx512bw(), "");
4466   kmovdl(tmp, src);
4467   kunpckdql(dst, tmp, tmp);
4468 }
4469 #endif // !_LP64