1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  37   switch (vlen_in_bytes) {
  38     case  4: // fall-through
  39     case  8: // fall-through
  40     case 16: return Assembler::AVX_128bit;
  41     case 32: return Assembler::AVX_256bit;
  42     case 64: return Assembler::AVX_512bit;
  43 
  44     default: {
  45       ShouldNotReachHere();
  46       return Assembler::AVX_NoVec;
  47     }
  48   }
  49 }
  50 
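// Build a vector mask for a post loop of 'src' iterations (a summary of the code below,
// not a spec): dst = (1 << src) - 1 selects the low 'src' lanes, that value is copied
// into the k-register 'mask', and dst is then restored to the original count 'src' so
// callers can keep using it.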
  51 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  52   guarantee(PostLoopMultiversioning, "must be");
  53   Assembler::movl(dst, 1);
  54   Assembler::shlxl(dst, dst, src);
  55   Assembler::decl(dst);
  56   Assembler::kmovdl(mask, dst);
  57   Assembler::movl(dst, src);
  58 }
  59 
  60 void C2_MacroAssembler::restorevectmask(KRegister mask) {
  61   guarantee(PostLoopMultiversioning, "must be");
  62   Assembler::knotwl(mask, k0);
  63 }
  64 
  65 #if INCLUDE_RTM_OPT
  66 
  67 // Update rtm_counters based on abort status
  68 // input: abort_status
  69 //        rtm_counters (RTMLockingCounters*)
  70 // flags are killed
  71 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  72 
  73   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  74   if (PrintPreciseRTMLockingStatistics) {
  75     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  76       Label check_abort;
  77       testl(abort_status, (1<<i));
  78       jccb(Assembler::equal, check_abort);
  79       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  80       bind(check_abort);
  81     }
  82   }
  83 }
  84 
  85 // Branch if ((random & (count-1)) != 0), count is 2^n
  86 // tmp, scr and flags are killed
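// For illustration: with count == RTMTotalCountIncrRate (e.g. 64), the branch is taken
// whenever the low 6 bits of the TSC are non-zero, i.e. for roughly 63 of every 64 calls,
// so only about 1/64 of transactions fall through to update the total counter.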
  87 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  88   assert(tmp == rax, "");
  89   assert(scr == rdx, "");
  90   rdtsc(); // modifies EDX:EAX
  91   andptr(tmp, count-1);
  92   jccb(Assembler::notZero, brLabel);
  93 }
  94 
  95 // Perform abort ratio calculation, set no_rtm bit if high ratio
  96 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  97 // tmpReg, rtm_counters_Reg and flags are killed
  98 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  99                                                     Register rtm_counters_Reg,
 100                                                     RTMLockingCounters* rtm_counters,
 101                                                     Metadata* method_data) {
 102   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 103 
 104   if (RTMLockingCalculationDelay > 0) {
 105     // Delay calculation
 106     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 107     testptr(tmpReg, tmpReg);
 108     jccb(Assembler::equal, L_done);
 109   }
 110   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 111   //   Aborted transactions = abort_count * 100
 112   //   All transactions = total_count *  RTMTotalCountIncrRate
 113   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
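  //   For illustration, with RTMAbortThreshold=1000, RTMTotalCountIncrRate=64 and
  //   RTMAbortRatio=50 (example values): a sampled total_count of 100 stands for roughly
  //   6400 transactions, so no_rtm is set once abort_count reaches 3200 (about a 50%
  //   observed abort ratio), provided abort_count is already past the 1000 threshold.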
 114 
 115   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 116   cmpptr(tmpReg, RTMAbortThreshold);
 117   jccb(Assembler::below, L_check_always_rtm2);
 118   imulptr(tmpReg, tmpReg, 100);
 119 
 120   Register scrReg = rtm_counters_Reg;
 121   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 122   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 123   imulptr(scrReg, scrReg, RTMAbortRatio);
 124   cmpptr(tmpReg, scrReg);
 125   jccb(Assembler::below, L_check_always_rtm1);
 126   if (method_data != NULL) {
 127     // set rtm_state to "no rtm" in MDO
 128     mov_metadata(tmpReg, method_data);
 129     lock();
 130     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 131   }
 132   jmpb(L_done);
 133   bind(L_check_always_rtm1);
 134   // Reload RTMLockingCounters* address
 135   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 136   bind(L_check_always_rtm2);
 137   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 138   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 139   jccb(Assembler::below, L_done);
 140   if (method_data != NULL) {
 141     // set rtm_state to "always rtm" in MDO
 142     mov_metadata(tmpReg, method_data);
 143     lock();
 144     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 145   }
 146   bind(L_done);
 147 }
 148 
 149 // Update counters and perform abort ratio calculation
 150 // input:  abort_status_Reg
 151 // rtm_counters_Reg, flags are killed
 152 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 153                                       Register rtm_counters_Reg,
 154                                       RTMLockingCounters* rtm_counters,
 155                                       Metadata* method_data,
 156                                       bool profile_rtm) {
 157 
 158   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 159   // update rtm counters based on rax value at abort
 160   // reads abort_status_Reg, updates flags
 161   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 162   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 163   if (profile_rtm) {
 164     // Save abort status because abort_status_Reg is used by following code.
 165     if (RTMRetryCount > 0) {
 166       push(abort_status_Reg);
 167     }
 168     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 169     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 170     // restore abort status
 171     if (RTMRetryCount > 0) {
 172       pop(abort_status_Reg);
 173     }
 174   }
 175 }
 176 
 177 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 178 // inputs: retry_count_Reg
 179 //       : abort_status_Reg
 180 // output: retry_count_Reg decremented by 1
 181 // flags are killed
 182 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 183   Label doneRetry;
 184   assert(abort_status_Reg == rax, "");
 185   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 186   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 187   // if reason is in 0x6 and retry count != 0 then retry
 188   andptr(abort_status_Reg, 0x6);
 189   jccb(Assembler::zero, doneRetry);
 190   testl(retry_count_Reg, retry_count_Reg);
 191   jccb(Assembler::zero, doneRetry);
 192   pause();
 193   decrementl(retry_count_Reg);
 194   jmp(retryLabel);
 195   bind(doneRetry);
 196 }
 197 
 198 // Spin and retry if lock is busy,
 199 // inputs: box_Reg (monitor address)
 200 //       : retry_count_Reg
 201 // output: retry_count_Reg decremented by 1
 202 //       : clear z flag if retry count exceeded
 203 // tmp_Reg, scr_Reg, flags are killed
 204 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 205                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 206   Label SpinLoop, SpinExit, doneRetry;
 207   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 208 
 209   testl(retry_count_Reg, retry_count_Reg);
 210   jccb(Assembler::zero, doneRetry);
 211   decrementl(retry_count_Reg);
 212   movptr(scr_Reg, RTMSpinLoopCount);
 213 
 214   bind(SpinLoop);
 215   pause();
 216   decrementl(scr_Reg);
 217   jccb(Assembler::lessEqual, SpinExit);
 218   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 219   testptr(tmp_Reg, tmp_Reg);
 220   jccb(Assembler::notZero, SpinLoop);
 221 
 222   bind(SpinExit);
 223   jmp(retryLabel);
 224   bind(doneRetry);
 225   incrementl(retry_count_Reg); // clear z flag
 226 }
 227 
 228 // Use RTM for normal stack locks
 229 // Input: objReg (object to lock)
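// Rough flow, summarized from the code below: start a transaction with xbegin and
// re-check the mark word inside it; if the object still looks unlocked the critical
// section runs transactionally and we branch to DONE_LABEL. Otherwise we either
// xabort (the abort handler then profiles the abort status when requested) or, with
// UseRTMXendForLockBusy, commit with xend and go straight to the retry check. Retries
// happen at most RTMRetryCount times before falling through to regular stack locking.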
 230 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 231                                          Register retry_on_abort_count_Reg,
 232                                          RTMLockingCounters* stack_rtm_counters,
 233                                          Metadata* method_data, bool profile_rtm,
 234                                          Label& DONE_LABEL, Label& IsInflated) {
 235   assert(UseRTMForStackLocks, "why call this otherwise?");
 236   assert(tmpReg == rax, "");
 237   assert(scrReg == rdx, "");
 238   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 239 
 240   if (RTMRetryCount > 0) {
 241     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 242     bind(L_rtm_retry);
 243   }
 244   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 245   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 246   jcc(Assembler::notZero, IsInflated);
 247 
 248   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 249     Label L_noincrement;
 250     if (RTMTotalCountIncrRate > 1) {
 251       // tmpReg, scrReg and flags are killed
 252       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 253     }
 254     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 255     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 256     bind(L_noincrement);
 257   }
 258   xbegin(L_on_abort);
 259   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 260   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 261   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 262   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 263 
 264   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 265   if (UseRTMXendForLockBusy) {
 266     xend();
 267     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 268     jmp(L_decrement_retry);
 269   }
 270   else {
 271     xabort(0);
 272   }
 273   bind(L_on_abort);
 274   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 275     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 276   }
 277   bind(L_decrement_retry);
 278   if (RTMRetryCount > 0) {
 279     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 280     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 281   }
 282 }
 283 
 284 // Use RTM for inflated locks
 285 // inputs: objReg (object to lock)
 286 //         boxReg (on-stack box address (displaced header location) - KILLED)
 287 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 288 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 289                                             Register scrReg, Register retry_on_busy_count_Reg,
 290                                             Register retry_on_abort_count_Reg,
 291                                             RTMLockingCounters* rtm_counters,
 292                                             Metadata* method_data, bool profile_rtm,
 293                                             Label& DONE_LABEL) {
 294   assert(UseRTMLocking, "why call this otherwise?");
 295   assert(tmpReg == rax, "");
 296   assert(scrReg == rdx, "");
 297   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 298   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 299 
 300   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 301   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 302   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 303 
 304   if (RTMRetryCount > 0) {
 305     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 306     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 307     bind(L_rtm_retry);
 308   }
 309   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 310     Label L_noincrement;
 311     if (RTMTotalCountIncrRate > 1) {
 312       // tmpReg, scrReg and flags are killed
 313       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 314     }
 315     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 316     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 317     bind(L_noincrement);
 318   }
 319   xbegin(L_on_abort);
 320   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 321   movptr(tmpReg, Address(tmpReg, owner_offset));
 322   testptr(tmpReg, tmpReg);
 323   jcc(Assembler::zero, DONE_LABEL);
 324   if (UseRTMXendForLockBusy) {
 325     xend();
 326     jmp(L_decrement_retry);
 327   }
 328   else {
 329     xabort(0);
 330   }
 331   bind(L_on_abort);
 332   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 333   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 334     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 335   }
 336   if (RTMRetryCount > 0) {
 337     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 338     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 339   }
 340 
 341   movptr(tmpReg, Address(boxReg, owner_offset));
 342   testptr(tmpReg, tmpReg);
 343   jccb(Assembler::notZero, L_decrement_retry);
 344 
 345   // Appears unlocked - try to swing _owner from null to non-null.
 346   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 347 #ifdef _LP64
 348   Register threadReg = r15_thread;
 349 #else
 350   get_thread(scrReg);
 351   Register threadReg = scrReg;
 352 #endif
 353   lock();
 354   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 355 
 356   if (RTMRetryCount > 0) {
 357     // success done else retry
 358     jccb(Assembler::equal, DONE_LABEL) ;
 359     bind(L_decrement_retry);
 360     // Spin and retry if lock is busy.
 361     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 362   }
 363   else {
 364     bind(L_decrement_retry);
 365   }
 366 }
 367 
 368 #endif //  INCLUDE_RTM_OPT
 369 
 370 // fast_lock and fast_unlock used by C2
 371 
 372 // Because the transitions from emitted code to the runtime
 373 // monitorenter/exit helper stubs are so slow it's critical that
 374 // we inline both the stack-locking fast path and the inflated fast path.
 375 //
 376 // See also: cmpFastLock and cmpFastUnlock.
 377 //
 378 // What follows is a specialized inline transliteration of the code
 379 // in enter() and exit(). If we're concerned about I$ bloat another
 380 // option would be to emit TrySlowEnter and TrySlowExit methods
 381 // at startup-time.  These methods would accept arguments as
 382 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 383 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 384 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 385 // In practice, however, the # of lock sites is bounded and is usually small.
 386 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 387 // if the processor uses simple bimodal branch predictors keyed by EIP,
 388 // since the helper routines would be called from multiple synchronization
 389 // sites.
 390 //
 391 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 392 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 393 // to those specialized methods.  That'd give us a mostly platform-independent
 394 // implementation that the JITs could optimize and inline at their pleasure.
 395 // Done correctly, the only time we'd need to cross to native code would be
 396 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 397 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 398 // (b) explicit barriers or fence operations.
 399 //
 400 // TODO:
 401 //
 402 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 403 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 404 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 405 //    the lock operators would typically be faster than reifying Self.
 406 //
 407 // *  Ideally I'd define the primitives as:
 408 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 409 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 410 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 411 //    Instead, we're stuck with rather awkward and brittle register assignments below.
 412 //    Furthermore the register assignments are overconstrained, possibly resulting in
 413 //    sub-optimal code near the synchronization site.
 414 //
 415 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 416 //    Alternately, use a better sp-proximity test.
 417 //
 418 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 419 //    Either one is sufficient to uniquely identify a thread.
 420 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 421 //
 422 // *  Intrinsify notify() and notifyAll() for the common cases where the
 423 //    object is locked by the calling thread but the waitlist is empty.  This would
 424 //    avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 425 //
 426 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 427 //    But beware of excessive branch density on AMD Opterons.
 428 //
 429 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 430 //    or failure of the fast path.  If the fast path fails then we pass
 431 //    control to the slow path, typically in C.  In fast_lock and
 432 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 433 //    will emit a conditional branch immediately after the node.
 434 //    So we have branches to branches and lots of ICC.ZF games.
 435 //    Instead, it might be better to have C2 pass a "FailureLabel"
 436 //    into fast_lock and fast_unlock.  In the case of success, control
 437 //    will drop through the node.  ICC.ZF is undefined at exit.
 438 //    In the case of failure, the node will branch directly to the
 439 //    FailureLabel
 440 
 441 
 442 // obj: object to lock
 443 // box: on-stack box address (displaced header location) - KILLED
 444 // rax,: tmp -- KILLED
 445 // scr: tmp -- KILLED
 446 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 447                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 448                                  RTMLockingCounters* rtm_counters,
 449                                  RTMLockingCounters* stack_rtm_counters,
 450                                  Metadata* method_data,
 451                                  bool use_rtm, bool profile_rtm) {
 452   // Ensure the register assignments are disjoint
 453   assert(tmpReg == rax, "");
 454 
 455   if (use_rtm) {
 456     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 457   } else {
 458     assert(cx2Reg == noreg, "");
 459     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 460   }
 461 
 462   // Possible cases that we'll encounter in fast_lock
 463   // ------------------------------------------------
 464   // * Inflated
 465   //    -- unlocked
 466   //    -- Locked
 467   //       = by self
 468   //       = by other
 469   // * neutral
 470   // * stack-locked
 471   //    -- by self
 472   //       = sp-proximity test hits
 473   //       = sp-proximity test generates false-negative
 474   //    -- by other
 475   //
 476 
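  // For reference (see markWord.hpp for the authoritative encoding), the two low mark
  // word bits tested below are: 01 = unlocked/neutral, 00 = stack-locked (the mark word
  // holds a pointer to the on-stack BasicLock), and 10 = inflated (the mark word holds
  // ObjectMonitor* | monitor_value).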
 477   Label IsInflated, DONE_LABEL;
 478 
 479   if (DiagnoseSyncOnValueBasedClasses != 0) {
 480     load_klass(tmpReg, objReg, cx1Reg);
 481     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 482     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 483     jcc(Assembler::notZero, DONE_LABEL);
 484   }
 485 
 486 #if INCLUDE_RTM_OPT
 487   if (UseRTMForStackLocks && use_rtm) {
 488     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 489                       stack_rtm_counters, method_data, profile_rtm,
 490                       DONE_LABEL, IsInflated);
 491   }
 492 #endif // INCLUDE_RTM_OPT
 493 
 494   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 495   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 496   jccb(Assembler::notZero, IsInflated);
 497 
 498   // Attempt stack-locking ...
 499   orptr (tmpReg, markWord::unlocked_value);
 500   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 501   lock();
 502   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 503   jcc(Assembler::equal, DONE_LABEL);           // Success
 504 
 505   // Recursive locking.
 506   // The object is stack-locked: markword contains stack pointer to BasicLock.
 507   // Locked by current thread if difference with current SP is less than one page.
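  // A sketch of the arithmetic, assuming 4K pages: on LP64 the mask is 7 - page_size,
  // i.e. 0x...F007, so after the subtraction the AND leaves zero (ZF=1) exactly when the
  // mark word is an 8-byte-aligned address less than one page above rsp, which is what a
  // stack-lock held by this thread looks like.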
 508   subptr(tmpReg, rsp);
 509   // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 510   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 511   movptr(Address(boxReg, 0), tmpReg);
 512   jmp(DONE_LABEL);
 513 
 514   bind(IsInflated);
 515   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 516 
 517 #if INCLUDE_RTM_OPT
 518   // Use the same RTM locking code in 32- and 64-bit VM.
 519   if (use_rtm) {
 520     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 521                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 522   } else {
 523 #endif // INCLUDE_RTM_OPT
 524 
 525 #ifndef _LP64
 526   // The object is inflated.
 527 
 528   // boxReg refers to the on-stack BasicLock in the current frame.
 529   // We'd like to write:
 530   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 531   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 532   // additional latency as we have another ST in the store buffer that must drain.
 533 
 534   // avoid ST-before-CAS
 535   // register juggle because we need tmpReg for cmpxchgptr below
 536   movptr(scrReg, boxReg);
 537   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 538 
 539   // Optimistic form: consider XORL tmpReg,tmpReg
 540   movptr(tmpReg, NULL_WORD);
 541 
 542   // Appears unlocked - try to swing _owner from null to non-null.
 543   // Ideally, I'd manifest "Self" with get_thread and then attempt
 544   // to CAS the register containing Self into m->Owner.
 545   // But we don't have enough registers, so instead we can either try to CAS
 546   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 547   // we later store "Self" into m->Owner.  Transiently storing a stack address
 548   // (rsp or the address of the box) into  m->owner is harmless.
 549   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 550   lock();
 551   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 552   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 553   // If we weren't able to swing _owner from NULL to the BasicLock
 554   // then take the slow path.
 555   jccb  (Assembler::notZero, DONE_LABEL);
 556   // update _owner from BasicLock to thread
 557   get_thread (scrReg);                    // beware: clobbers ICCs
 558   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 559   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 560 
 561   // If the CAS fails we can either retry or pass control to the slow path.
 562   // We use the latter tactic.
 563   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 564   // If the CAS was successful ...
 565   //   Self has acquired the lock
 566   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 567   // Intentional fall-through into DONE_LABEL ...
 568 #else // _LP64
 569   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 570   movq(scrReg, tmpReg);
 571   xorq(tmpReg, tmpReg);
 572   lock();
 573   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 574   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 575   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 576   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 577   // Intentional fall-through into DONE_LABEL ...
 578   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 579 #endif // _LP64
 580 #if INCLUDE_RTM_OPT
 581   } // use_rtm()
 582 #endif
 583   // DONE_LABEL is a hot target - we'd really like to place it at the
 584   // start of cache line by padding with NOPs.
 585   // See the AMD and Intel software optimization manuals for the
 586   // most efficient "long" NOP encodings.
 587   // Unfortunately none of our alignment mechanisms suffice.
 588   bind(DONE_LABEL);
 589 
 590   // At DONE_LABEL the icc ZFlag is set as follows ...
 591   // fast_unlock uses the same protocol.
 592   // ZFlag == 1 -> Success
 593   // ZFlag == 0 -> Failure - force control through the slow path
 594 }
 595 
 596 // obj: object to unlock
 597 // box: box address (displaced header location), killed.  Must be EAX.
 598 // tmp: killed, cannot be obj nor box.
 599 //
 600 // Some commentary on balanced locking:
 601 //
 602 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 603 // Methods that don't have provably balanced locking are forced to run in the
 604 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 605 // The interpreter provides two properties:
 606 // I1:  At return-time the interpreter automatically and quietly unlocks any
 607 //      objects acquired by the current activation (frame).  Recall that the
 608 //      interpreter maintains an on-stack list of locks currently held by
 609 //      a frame.
 610 // I2:  If a method attempts to unlock an object that is not held by the
 611 //      frame, the interpreter throws IMSX.
 612 //
 613 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 614 // B() doesn't have provably balanced locking so it runs in the interpreter.
 615 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 616 // is still locked by A().
 617 //
 618 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 619 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 620 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 621 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 622 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 623 // could reasonably *avoid* checking owner in fast_unlock().
 624 // In the interest of performance we elide the m->Owner==Self check in unlock.
 625 // A perfectly viable alternative is to elide the owner check except when
 626 // Xcheck:jni is enabled.
 627 
 628 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 629   assert(boxReg == rax, "");
 630   assert_different_registers(objReg, boxReg, tmpReg);
 631 
 632   Label DONE_LABEL, Stacked, CheckSucc;
 633 
 634 #if INCLUDE_RTM_OPT
 635   if (UseRTMForStackLocks && use_rtm) {
 636     Label L_regular_unlock;
 637     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 638     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 639     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 640     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 641     xend();                                                           // otherwise end...
 642     jmp(DONE_LABEL);                                                  // ... and we're done
 643     bind(L_regular_unlock);
 644   }
 645 #endif
 646 
 647   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 648   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 649   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 650   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 651   jccb  (Assembler::zero, Stacked);
 652 
 653   // It's inflated.
 654 #if INCLUDE_RTM_OPT
 655   if (use_rtm) {
 656     Label L_regular_inflated_unlock;
 657     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 658     movptr(boxReg, Address(tmpReg, owner_offset));
 659     testptr(boxReg, boxReg);
 660     jccb(Assembler::notZero, L_regular_inflated_unlock);
 661     xend();
 662     jmpb(DONE_LABEL);
 663     bind(L_regular_inflated_unlock);
 664   }
 665 #endif
 666 
 667   // Despite our balanced locking property we still check that m->_owner == Self
 668   // as java routines or native JNI code called by this thread might
 669   // have released the lock.
 670   // Refer to the comments in synchronizer.cpp for how we might encode extra
 671   // state in _succ so we can avoid fetching EntryList|cxq.
 672   //
 673   // I'd like to add more cases in fast_lock() and fast_unlock() --
 674   // such as recursive enter and exit -- but we have to be wary of
 675   // I$ bloat, T$ effects and BP$ effects.
 676   //
 677   // If there's no contention try a 1-0 exit.  That is, exit without
 678   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 679   // we detect and recover from the race that the 1-0 exit admits.
 680   //
 681   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 682   // before it STs null into _owner, releasing the lock.  Updates
 683   // to data protected by the critical section must be visible before
 684   // we drop the lock (and thus before any other thread could acquire
 685   // the lock and observe the fields protected by the lock).
 686   // IA32's memory-model is SPO, so STs are ordered with respect to
 687   // each other and there's no need for an explicit barrier (fence).
 688   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 689 #ifndef _LP64
 690   get_thread (boxReg);
 691 
 692   // Note that we could employ various encoding schemes to reduce
 693   // the number of loads below (currently 4) to just 2 or 3.
 694   // Refer to the comments in synchronizer.cpp.
 695   // In practice the chain of fetches doesn't seem to impact performance, however.
 696   xorptr(boxReg, boxReg);
 697   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 698   jccb  (Assembler::notZero, DONE_LABEL);
 699   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 700   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 701   jccb  (Assembler::notZero, CheckSucc);
 702   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 703   jmpb  (DONE_LABEL);
 704 
 705   bind (Stacked);
 706   // It's not inflated and it's not recursively stack-locked.
 707   // It must be stack-locked.
 708   // Try to reset the header to displaced header.
 709   // The "box" value on the stack is stable, so we can reload
 710   // and be assured we observe the same value as above.
 711   movptr(tmpReg, Address(boxReg, 0));
 712   lock();
 713   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 714   // Intentional fall-through into DONE_LABEL
 715 
 716   // DONE_LABEL is a hot target - we'd really like to place it at the
 717   // start of cache line by padding with NOPs.
 718   // See the AMD and Intel software optimization manuals for the
 719   // most efficient "long" NOP encodings.
 720   // Unfortunately none of our alignment mechanisms suffice.
 721   bind (CheckSucc);
 722 #else // _LP64
 723   // It's inflated
 724   xorptr(boxReg, boxReg);
 725   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 726   jccb  (Assembler::notZero, DONE_LABEL);
 727   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 728   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 729   jccb  (Assembler::notZero, CheckSucc);
 730   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 731   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 732   jmpb  (DONE_LABEL);
 733 
 734   // Try to avoid passing control into the slow_path ...
 735   Label LSuccess, LGoSlowPath ;
 736   bind  (CheckSucc);
 737 
 738   // The following optional optimization can be elided if necessary
 739   // Effectively: if (succ == null) goto slow path
 740   // The code reduces the window for a race, however,
 741   // and thus benefits performance.
 742   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 743   jccb  (Assembler::zero, LGoSlowPath);
 744 
 745   xorptr(boxReg, boxReg);
 746   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 747   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 748 
 749   // Memory barrier/fence
 750   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 751   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 752   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 753   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 754   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 755   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 756   lock(); addl(Address(rsp, 0), 0);
 757 
 758   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 759   jccb  (Assembler::notZero, LSuccess);
 760 
 761   // Rare inopportune interleaving - race.
 762   // The successor vanished in the small window above.
 763   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 764   // We need to ensure progress and succession.
 765   // Try to reacquire the lock.
 766   // If that fails then the new owner is responsible for succession and this
 767   // thread needs to take no further action and can exit via the fast path (success).
 768   // If the re-acquire succeeds then pass control into the slow path.
 769   // As implemented, this latter mode is horrible because we generate more
 770   // coherence traffic on the lock *and* artificially extend the critical section
 771   // length by virtue of passing control into the slow path.
 772 
 773   // box is really RAX -- the following CMPXCHG depends on that binding
 774   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 775   lock();
 776   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 777   // There's no successor so we tried to regrab the lock.
 778   // If that didn't work, then another thread grabbed the
 779   // lock so we're done (and exit was a success).
 780   jccb  (Assembler::notEqual, LSuccess);
 781   // Intentional fall-through into slow path
 782 
 783   bind  (LGoSlowPath);
 784   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 785   jmpb  (DONE_LABEL);
 786 
 787   bind  (LSuccess);
 788   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 789   jmpb  (DONE_LABEL);
 790 
 791   bind  (Stacked);
 792   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 793   lock();
 794   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 795 
 796 #endif
 797   bind(DONE_LABEL);
 798 }
 799 
 800 //-------------------------------------------------------------------------------------------
 801 // Generic instruction support used by C2 code generation from .ad files
 802 
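// Abs/neg of packed floating point values is done with bit masks, a standard trick:
// Op_AbsV* ANDs with a mask that clears only the sign bit of each lane, while Op_NegV*
// XORs with a mask that has only the sign bit set; the ExternalAddress constants below
// refer to those masks in the stub area.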
 803 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 804   if (dst != src) {
 805     movdqu(dst, src);
 806   }
 807   if (opcode == Op_AbsVD) {
 808     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 809   } else {
 810     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 811     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 812   }
 813 }
 814 
 815 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 816   if (opcode == Op_AbsVD) {
 817     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 818   } else {
 819     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 820     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 821   }
 822 }
 823 
 824 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 825   if (dst != src) {
 826     movdqu(dst, src);
 827   }
 828   if (opcode == Op_AbsVF) {
 829     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 830   } else {
 831     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 832     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 833   }
 834 }
 835 
 836 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 837   if (opcode == Op_AbsVF) {
 838     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 839   } else {
 840     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 841     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 842   }
 843 }
 844 
 845 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 846   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 847   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 848 
 849   if (opcode == Op_MinV) {
 850     if (elem_bt == T_BYTE) {
 851       pminsb(dst, src);
 852     } else if (elem_bt == T_SHORT) {
 853       pminsw(dst, src);
 854     } else if (elem_bt == T_INT) {
 855       pminsd(dst, src);
 856     } else {
 857       assert(elem_bt == T_LONG, "required");
 858       assert(tmp == xmm0, "required");
 859       assert_different_registers(dst, src, tmp);
 860       movdqu(xmm0, dst);
 861       pcmpgtq(xmm0, src);
 862       blendvpd(dst, src);  // xmm0 as mask
 863     }
 864   } else { // opcode == Op_MaxV
 865     if (elem_bt == T_BYTE) {
 866       pmaxsb(dst, src);
 867     } else if (elem_bt == T_SHORT) {
 868       pmaxsw(dst, src);
 869     } else if (elem_bt == T_INT) {
 870       pmaxsd(dst, src);
 871     } else {
 872       assert(elem_bt == T_LONG, "required");
 873       assert(tmp == xmm0, "required");
 874       assert_different_registers(dst, src, tmp);
 875       movdqu(xmm0, src);
 876       pcmpgtq(xmm0, dst);
 877       blendvpd(dst, src);  // xmm0 as mask
 878     }
 879   }
 880 }
 881 
 882 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 883                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 884                                  int vlen_enc) {
 885   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 886 
 887   if (opcode == Op_MinV) {
 888     if (elem_bt == T_BYTE) {
 889       vpminsb(dst, src1, src2, vlen_enc);
 890     } else if (elem_bt == T_SHORT) {
 891       vpminsw(dst, src1, src2, vlen_enc);
 892     } else if (elem_bt == T_INT) {
 893       vpminsd(dst, src1, src2, vlen_enc);
 894     } else {
 895       assert(elem_bt == T_LONG, "required");
 896       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 897         vpminsq(dst, src1, src2, vlen_enc);
 898       } else {
 899         assert_different_registers(dst, src1, src2);
 900         vpcmpgtq(dst, src1, src2, vlen_enc);
 901         vblendvpd(dst, src1, src2, dst, vlen_enc);
 902       }
 903     }
 904   } else { // opcode == Op_MaxV
 905     if (elem_bt == T_BYTE) {
 906       vpmaxsb(dst, src1, src2, vlen_enc);
 907     } else if (elem_bt == T_SHORT) {
 908       vpmaxsw(dst, src1, src2, vlen_enc);
 909     } else if (elem_bt == T_INT) {
 910       vpmaxsd(dst, src1, src2, vlen_enc);
 911     } else {
 912       assert(elem_bt == T_LONG, "required");
 913       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 914         vpmaxsq(dst, src1, src2, vlen_enc);
 915       } else {
 916         assert_different_registers(dst, src1, src2);
 917         vpcmpgtq(dst, src1, src2, vlen_enc);
 918         vblendvpd(dst, src2, src1, dst, vlen_enc);
 919       }
 920     }
 921   }
 922 }
 923 
 924 // Float/Double min max
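// These blend/min/blend sequences implement Java Math.min/max semantics for vectors
// (roughly): a NaN in either input must produce NaN, and -0.0 must order below +0.0,
// which the plain vector min/max instructions alone do not guarantee. The leading blends
// use a sign bit to order the two inputs, the vector min/max does the comparison, and the
// final UNORD_Q compare re-injects NaN lanes into the result.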
 925 
 926 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 927                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 928                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 929                                    int vlen_enc) {
 930   assert(UseAVX > 0, "required");
 931   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 932          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 933   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 934   assert_different_registers(a, b, tmp, atmp, btmp);
 935 
 936   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 937   bool is_double_word = is_double_word_type(elem_bt);
 938 
 939   if (!is_double_word && is_min) {
 940     vblendvps(atmp, a, b, a, vlen_enc);
 941     vblendvps(btmp, b, a, a, vlen_enc);
 942     vminps(tmp, atmp, btmp, vlen_enc);
 943     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 944     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 945   } else if (!is_double_word && !is_min) {
 946     vblendvps(btmp, b, a, b, vlen_enc);
 947     vblendvps(atmp, a, b, b, vlen_enc);
 948     vmaxps(tmp, atmp, btmp, vlen_enc);
 949     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 950     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 951   } else if (is_double_word && is_min) {
 952     vblendvpd(atmp, a, b, a, vlen_enc);
 953     vblendvpd(btmp, b, a, a, vlen_enc);
 954     vminpd(tmp, atmp, btmp, vlen_enc);
 955     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 956     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 957   } else {
 958     assert(is_double_word && !is_min, "sanity");
 959     vblendvpd(btmp, b, a, b, vlen_enc);
 960     vblendvpd(atmp, a, b, b, vlen_enc);
 961     vmaxpd(tmp, atmp, btmp, vlen_enc);
 962     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 963     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 964   }
 965 }
 966 
 967 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 968                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 969                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 970                                     int vlen_enc) {
 971   assert(UseAVX > 2, "required");
 972   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 973          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 974   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 975   assert_different_registers(dst, a, b, atmp, btmp);
 976 
 977   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 978   bool is_double_word = is_double_word_type(elem_bt);
 979   bool merge = true;
 980 
 981   if (!is_double_word && is_min) {
 982     evpmovd2m(ktmp, a, vlen_enc);
 983     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
 984     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
 985     vminps(dst, atmp, btmp, vlen_enc);
 986     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 987     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
 988   } else if (!is_double_word && !is_min) {
 989     evpmovd2m(ktmp, b, vlen_enc);
 990     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
 991     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
 992     vmaxps(dst, atmp, btmp, vlen_enc);
 993     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 994     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
 995   } else if (is_double_word && is_min) {
 996     evpmovq2m(ktmp, a, vlen_enc);
 997     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
 998     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
 999     vminpd(dst, atmp, btmp, vlen_enc);
1000     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1001     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1002   } else {
1003     assert(is_double_word && !is_min, "sanity");
1004     evpmovq2m(ktmp, b, vlen_enc);
1005     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1006     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1007     vmaxpd(dst, atmp, btmp, vlen_enc);
1008     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1009     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1010   }
1011 }
1012 
1013 // Float/Double signum
1014 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1015                                   XMMRegister zero, XMMRegister one,
1016                                   Register scratch) {
1017   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1018 
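  // Strategy, as implemented below: +0.0, -0.0 and NaN are returned unchanged; otherwise
  // load +1.0 and, if the input compared below zero, flip its sign bit to produce -1.0.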
1019   Label DONE_LABEL;
1020 
1021   if (opcode == Op_SignumF) {
1022     assert(UseSSE > 0, "required");
1023     ucomiss(dst, zero);
1024     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1025     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1026     movflt(dst, one);
1027     jcc(Assembler::above, DONE_LABEL);
1028     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1029   } else if (opcode == Op_SignumD) {
1030     assert(UseSSE > 1, "required");
1031     ucomisd(dst, zero);
1032     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1033     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1034     movdbl(dst, one);
1035     jcc(Assembler::above, DONE_LABEL);
1036     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1037   }
1038 
1039   bind(DONE_LABEL);
1040 }
1041 
1042 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1043   if (sign) {
1044     pmovsxbw(dst, src);
1045   } else {
1046     pmovzxbw(dst, src);
1047   }
1048 }
1049 
1050 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1051   if (sign) {
1052     vpmovsxbw(dst, src, vector_len);
1053   } else {
1054     vpmovzxbw(dst, src, vector_len);
1055   }
1056 }
1057 
1058 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1059   if (sign) {
1060     vpmovsxbd(dst, src, vector_len);
1061   } else {
1062     vpmovzxbd(dst, src, vector_len);
1063   }
1064 }
1065 
1066 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1067   if (sign) {
1068     vpmovsxwd(dst, src, vector_len);
1069   } else {
1070     vpmovzxwd(dst, src, vector_len);
1071   }
1072 }
1073 
1074 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1075                                      int shift, int vector_len) {
1076   if (opcode == Op_RotateLeftV) {
1077     if (etype == T_INT) {
1078       evprold(dst, src, shift, vector_len);
1079     } else {
1080       assert(etype == T_LONG, "expected type T_LONG");
1081       evprolq(dst, src, shift, vector_len);
1082     }
1083   } else {
1084     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1085     if (etype == T_INT) {
1086       evprord(dst, src, shift, vector_len);
1087     } else {
1088       assert(etype == T_LONG, "expected type T_LONG");
1089       evprorq(dst, src, shift, vector_len);
1090     }
1091   }
1092 }
1093 
1094 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1095                                      XMMRegister shift, int vector_len) {
1096   if (opcode == Op_RotateLeftV) {
1097     if (etype == T_INT) {
1098       evprolvd(dst, src, shift, vector_len);
1099     } else {
1100       assert(etype == T_LONG, "expected type T_LONG");
1101       evprolvq(dst, src, shift, vector_len);
1102     }
1103   } else {
1104     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1105     if (etype == T_INT) {
1106       evprorvd(dst, src, shift, vector_len);
1107     } else {
1108       assert(etype == T_LONG, "expected type T_LONG");
1109       evprorvq(dst, src, shift, vector_len);
1110     }
1111   }
1112 }
1113 
1114 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1115   if (opcode == Op_RShiftVI) {
1116     psrad(dst, shift);
1117   } else if (opcode == Op_LShiftVI) {
1118     pslld(dst, shift);
1119   } else {
1120     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1121     psrld(dst, shift);
1122   }
1123 }
1124 
1125 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1126   switch (opcode) {
1127     case Op_RShiftVI:  psrad(dst, shift); break;
1128     case Op_LShiftVI:  pslld(dst, shift); break;
1129     case Op_URShiftVI: psrld(dst, shift); break;
1130 
1131     default: assert(false, "%s", NodeClassNames[opcode]);
1132   }
1133 }
1134 
1135 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1136   if (opcode == Op_RShiftVI) {
1137     vpsrad(dst, nds, shift, vector_len);
1138   } else if (opcode == Op_LShiftVI) {
1139     vpslld(dst, nds, shift, vector_len);
1140   } else {
1141     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1142     vpsrld(dst, nds, shift, vector_len);
1143   }
1144 }
1145 
1146 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1147   switch (opcode) {
1148     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1149     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1150     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1151 
1152     default: assert(false, "%s", NodeClassNames[opcode]);
1153   }
1154 }
1155 
1156 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1157   switch (opcode) {
1158     case Op_RShiftVB:  // fall-through
1159     case Op_RShiftVS:  psraw(dst, shift); break;
1160 
1161     case Op_LShiftVB:  // fall-through
1162     case Op_LShiftVS:  psllw(dst, shift);   break;
1163 
1164     case Op_URShiftVS: // fall-through
1165     case Op_URShiftVB: psrlw(dst, shift);  break;
1166 
1167     default: assert(false, "%s", NodeClassNames[opcode]);
1168   }
1169 }
1170 
1171 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1172   switch (opcode) {
1173     case Op_RShiftVB:  // fall-through
1174     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1175 
1176     case Op_LShiftVB:  // fall-through
1177     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1178 
1179     case Op_URShiftVS: // fall-through
1180     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1181 
1182     default: assert(false, "%s", NodeClassNames[opcode]);
1183   }
1184 }
1185 
1186 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1187   switch (opcode) {
1188     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1189     case Op_LShiftVL:  psllq(dst, shift); break;
1190     case Op_URShiftVL: psrlq(dst, shift); break;
1191 
1192     default: assert(false, "%s", NodeClassNames[opcode]);
1193   }
1194 }
1195 
1196 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1197   if (opcode == Op_RShiftVL) {
1198     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1199   } else if (opcode == Op_LShiftVL) {
1200     psllq(dst, shift);
1201   } else {
1202     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1203     psrlq(dst, shift);
1204   }
1205 }
1206 
1207 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1208   switch (opcode) {
1209     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1210     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1211     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1212 
1213     default: assert(false, "%s", NodeClassNames[opcode]);
1214   }
1215 }
1216 
1217 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1218   if (opcode == Op_RShiftVL) {
1219     evpsraq(dst, nds, shift, vector_len);
1220   } else if (opcode == Op_LShiftVL) {
1221     vpsllq(dst, nds, shift, vector_len);
1222   } else {
1223     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1224     vpsrlq(dst, nds, shift, vector_len);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1229   switch (opcode) {
1230     case Op_RShiftVB:  // fall-through
1231     case Op_RShiftVS:  // fall-through
1232     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1233 
1234     case Op_LShiftVB:  // fall-through
1235     case Op_LShiftVS:  // fall-through
1236     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1237 
1238     case Op_URShiftVB: // fall-through
1239     case Op_URShiftVS: // fall-through
1240     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1241 
1242     default: assert(false, "%s", NodeClassNames[opcode]);
1243   }
1244 }
1245 
1246 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1247   switch (opcode) {
1248     case Op_RShiftVB:  // fall-through
1249     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1250 
1251     case Op_LShiftVB:  // fall-through
1252     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1253 
1254     case Op_URShiftVB: // fall-through
1255     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1256 
1257     default: assert(false, "%s", NodeClassNames[opcode]);
1258   }
1259 }
1260 
1261 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1262   assert(UseAVX >= 2, "required");
1263   switch (opcode) {
1264     case Op_RShiftVL: {
1265       if (UseAVX > 2) {
1266         assert(tmp == xnoreg, "not used");
1267         if (!VM_Version::supports_avx512vl()) {
1268           vlen_enc = Assembler::AVX_512bit;
1269         }
1270         evpsravq(dst, src, shift, vlen_enc);
1271       } else {
1272         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1273         vpsrlvq(dst, src, shift, vlen_enc);
1274         vpsrlvq(tmp, tmp, shift, vlen_enc);
1275         vpxor(dst, dst, tmp, vlen_enc);
1276         vpsubq(dst, dst, tmp, vlen_enc);
1277       }
1278       break;
1279     }
1280     case Op_LShiftVL: {
1281       assert(tmp == xnoreg, "not used");
1282       vpsllvq(dst, src, shift, vlen_enc);
1283       break;
1284     }
1285     case Op_URShiftVL: {
1286       assert(tmp == xnoreg, "not used");
1287       vpsrlvq(dst, src, shift, vlen_enc);
1288       break;
1289     }
1290     default: assert(false, "%s", NodeClassNames[opcode]);
1291   }
1292 }
1293 
1294 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1295 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1296   assert(opcode == Op_LShiftVB ||
1297          opcode == Op_RShiftVB ||
1298          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1299   bool sign = (opcode != Op_URShiftVB);
1300   assert(vector_len == 0, "required");
1301   vextendbd(sign, dst, src, 1);
1302   vpmovzxbd(vtmp, shift, 1);
1303   varshiftd(opcode, dst, dst, vtmp, 1);
1304   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1305   vextracti128_high(vtmp, dst);
1306   vpackusdw(dst, dst, vtmp, 0);
1307 }
1308 
1309 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1310 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1311   assert(opcode == Op_LShiftVB ||
1312          opcode == Op_RShiftVB ||
1313          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1314   bool sign = (opcode != Op_URShiftVB);
1315   int ext_vector_len = vector_len + 1;
1316   vextendbw(sign, dst, src, ext_vector_len);
1317   vpmovzxbw(vtmp, shift, ext_vector_len);
1318   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1319   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1320   if (vector_len == 0) {
1321     vextracti128_high(vtmp, dst);
1322     vpackuswb(dst, dst, vtmp, vector_len);
1323   } else {
1324     vextracti64x4_high(vtmp, dst);
1325     vpackuswb(dst, dst, vtmp, vector_len);
1326     vpermq(dst, dst, 0xD8, vector_len);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1331   switch(typ) {
1332     case T_BYTE:
1333       pinsrb(dst, val, idx);
1334       break;
1335     case T_SHORT:
1336       pinsrw(dst, val, idx);
1337       break;
1338     case T_INT:
1339       pinsrd(dst, val, idx);
1340       break;
1341     case T_LONG:
1342       pinsrq(dst, val, idx);
1343       break;
1344     default:
1345       assert(false,"Should not reach here.");
1346       break;
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1351   switch(typ) {
1352     case T_BYTE:
1353       vpinsrb(dst, src, val, idx);
1354       break;
1355     case T_SHORT:
1356       vpinsrw(dst, src, val, idx);
1357       break;
1358     case T_INT:
1359       vpinsrd(dst, src, val, idx);
1360       break;
1361     case T_LONG:
1362       vpinsrq(dst, src, val, idx);
1363       break;
1364     default:
1365       assert(false,"Should not reach here.");
1366       break;
1367   }
1368 }
1369 
1370 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1371   switch(typ) {
1372     case T_INT:
1373       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1374       break;
1375     case T_FLOAT:
1376       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1377       break;
1378     case T_LONG:
1379       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1380       break;
1381     case T_DOUBLE:
1382       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1383       break;
1384     default:
1385       assert(false,"Should not reach here.");
1386       break;
1387   }
1388 }
1389 
1390 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1391   switch(typ) {
1392     case T_INT:
1393       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1394       break;
1395     case T_FLOAT:
1396       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1397       break;
1398     case T_LONG:
1399       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1400       break;
1401     case T_DOUBLE:
1402       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1403       break;
1404     default:
1405       assert(false,"Should not reach here.");
1406       break;
1407   }
1408 }
1409 
1410 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1411   switch(typ) {
1412     case T_INT:
1413       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1414       break;
1415     case T_FLOAT:
1416       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1417       break;
1418     case T_LONG:
1419       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1420       break;
1421     case T_DOUBLE:
1422       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1423       break;
1424     default:
1425       assert(false,"Should not reach here.");
1426       break;
1427   }
1428 }
1429 
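// Materialize a vector mask from a boolean vector (one byte per element, 0 or 1):
// negate each byte (0 -> 0x00, 1 -> 0xFF) and sign-extend to the element width so
// every lane of dst ends up either all zeros or all ones.
// E.g. src = [1, 0, 1, ...] bytes with elem_bt == T_INT gives dst = [0xFFFFFFFF, 0, 0xFFFFFFFF, ...].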
1430 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1431   if (vlen_in_bytes <= 16) {
1432     pxor (dst, dst);
1433     psubb(dst, src);
1434     switch (elem_bt) {
1435       case T_BYTE:   /* nothing to do */ break;
1436       case T_SHORT:  pmovsxbw(dst, dst); break;
1437       case T_INT:    pmovsxbd(dst, dst); break;
1438       case T_FLOAT:  pmovsxbd(dst, dst); break;
1439       case T_LONG:   pmovsxbq(dst, dst); break;
1440       case T_DOUBLE: pmovsxbq(dst, dst); break;
1441 
1442       default: assert(false, "%s", type2name(elem_bt));
1443     }
1444   } else {
1445     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1446     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1447 
1448     vpxor (dst, dst, dst, vlen_enc);
1449     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1450 
1451     switch (elem_bt) {
1452       case T_BYTE:   /* nothing to do */            break;
1453       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1454       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1455       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1456       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1457       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1458 
1459       default: assert(false, "%s", type2name(elem_bt));
1460     }
1461   }
1462 }
1463 
1464 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp,
1465                                          Register tmp, bool novlbwdq, int vlen_enc) {
1466   if (novlbwdq) {
1467     vpmovsxbd(xtmp, src, vlen_enc);
1468     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1469             Assembler::eq, true, vlen_enc, tmp);
1470   } else {
1471     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1472     vpsubb(xtmp, xtmp, src, vlen_enc);
1473     evpmovb2m(dst, xtmp, vlen_enc);
1474   }
1475 }
1476 
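// Load the first vlen_in_bytes entries of the iota index table (consecutive byte
// values 0, 1, 2, ...) into dst, using the narrowest load that covers the
// requested vector size.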
1477 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1478   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1479   if (vlen_in_bytes == 4) {
1480     movdl(dst, addr);
1481   } else if (vlen_in_bytes == 8) {
1482     movq(dst, addr);
1483   } else if (vlen_in_bytes == 16) {
1484     movdqu(dst, addr, scratch);
1485   } else if (vlen_in_bytes == 32) {
1486     vmovdqu(dst, addr, scratch);
1487   } else {
1488     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1489     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1490   }
1491 }
1492 
1493 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
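//
// The integer reducers below work by repeatedly folding the upper half of the
// vector onto the lower half (via reduce_operation_128/256) and delegating to
// the next narrower routine, until the vector result is finally combined with
// the scalar value passed in src1.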
1494 
1495 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1496   int vector_len = Assembler::AVX_128bit;
1497 
1498   switch (opcode) {
1499     case Op_AndReductionV:  pand(dst, src); break;
1500     case Op_OrReductionV:   por (dst, src); break;
1501     case Op_XorReductionV:  pxor(dst, src); break;
1502     case Op_MinReductionV:
1503       switch (typ) {
1504         case T_BYTE:        pminsb(dst, src); break;
1505         case T_SHORT:       pminsw(dst, src); break;
1506         case T_INT:         pminsd(dst, src); break;
1507         case T_LONG:        assert(UseAVX > 2, "required");
1508                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1509         default:            assert(false, "wrong type");
1510       }
1511       break;
1512     case Op_MaxReductionV:
1513       switch (typ) {
1514         case T_BYTE:        pmaxsb(dst, src); break;
1515         case T_SHORT:       pmaxsw(dst, src); break;
1516         case T_INT:         pmaxsd(dst, src); break;
1517         case T_LONG:        assert(UseAVX > 2, "required");
1518                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1519         default:            assert(false, "wrong type");
1520       }
1521       break;
1522     case Op_AddReductionVF: addss(dst, src); break;
1523     case Op_AddReductionVD: addsd(dst, src); break;
1524     case Op_AddReductionVI:
1525       switch (typ) {
1526         case T_BYTE:        paddb(dst, src); break;
1527         case T_SHORT:       paddw(dst, src); break;
1528         case T_INT:         paddd(dst, src); break;
1529         default:            assert(false, "wrong type");
1530       }
1531       break;
1532     case Op_AddReductionVL: paddq(dst, src); break;
1533     case Op_MulReductionVF: mulss(dst, src); break;
1534     case Op_MulReductionVD: mulsd(dst, src); break;
1535     case Op_MulReductionVI:
1536       switch (typ) {
1537         case T_SHORT:       pmullw(dst, src); break;
1538         case T_INT:         pmulld(dst, src); break;
1539         default:            assert(false, "wrong type");
1540       }
1541       break;
1542     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1543                             vpmullq(dst, dst, src, vector_len); break;
1544     default:                assert(false, "wrong opcode");
1545   }
1546 }
1547 
1548 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1549   int vector_len = Assembler::AVX_256bit;
1550 
1551   switch (opcode) {
1552     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1553     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1554     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1555     case Op_MinReductionV:
1556       switch (typ) {
1557         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1558         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1559         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1560         case T_LONG:        assert(UseAVX > 2, "required");
1561                             vpminsq(dst, src1, src2, vector_len); break;
1562         default:            assert(false, "wrong type");
1563       }
1564       break;
1565     case Op_MaxReductionV:
1566       switch (typ) {
1567         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1568         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1569         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1570         case T_LONG:        assert(UseAVX > 2, "required");
1571                             vpmaxsq(dst, src1, src2, vector_len); break;
1572         default:            assert(false, "wrong type");
1573       }
1574       break;
1575     case Op_AddReductionVI:
1576       switch (typ) {
1577         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1578         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1579         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1580         default:            assert(false, "wrong type");
1581       }
1582       break;
1583     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1584     case Op_MulReductionVI:
1585       switch (typ) {
1586         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1587         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1588         default:            assert(false, "wrong type");
1589       }
1590       break;
1591     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1592     default:                assert(false, "wrong opcode");
1593   }
1594 }
1595 
1596 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1597                                   XMMRegister dst, XMMRegister src,
1598                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1599   switch (opcode) {
1600     case Op_AddReductionVF:
1601     case Op_MulReductionVF:
1602       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1603       break;
1604 
1605     case Op_AddReductionVD:
1606     case Op_MulReductionVD:
1607       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1608       break;
1609 
1610     default: assert(false, "wrong opcode");
1611   }
1612 }
1613 
1614 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1615                              Register dst, Register src1, XMMRegister src2,
1616                              XMMRegister vtmp1, XMMRegister vtmp2) {
1617   switch (vlen) {
1618     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1619     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1620     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1621     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1622 
1623     default: assert(false, "wrong vector length");
1624   }
1625 }
1626 
1627 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1628                              Register dst, Register src1, XMMRegister src2,
1629                              XMMRegister vtmp1, XMMRegister vtmp2) {
1630   switch (vlen) {
1631     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1632     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1633     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1634     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1635 
1636     default: assert(false, "wrong vector length");
1637   }
1638 }
1639 
1640 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1641                              Register dst, Register src1, XMMRegister src2,
1642                              XMMRegister vtmp1, XMMRegister vtmp2) {
1643   switch (vlen) {
1644     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1645     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1646     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1647     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1648 
1649     default: assert(false, "wrong vector length");
1650   }
1651 }
1652 
1653 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1654                              Register dst, Register src1, XMMRegister src2,
1655                              XMMRegister vtmp1, XMMRegister vtmp2) {
1656   switch (vlen) {
1657     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1658     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1659     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1660     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1661 
1662     default: assert(false, "wrong vector length");
1663   }
1664 }
1665 
1666 #ifdef _LP64
1667 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1668                              Register dst, Register src1, XMMRegister src2,
1669                              XMMRegister vtmp1, XMMRegister vtmp2) {
1670   switch (vlen) {
1671     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1672     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1673     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1674 
1675     default: assert(false, "wrong vector length");
1676   }
1677 }
1678 #endif // _LP64
1679 
1680 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1681   switch (vlen) {
1682     case 2:
1683       assert(vtmp2 == xnoreg, "");
1684       reduce2F(opcode, dst, src, vtmp1);
1685       break;
1686     case 4:
1687       assert(vtmp2 == xnoreg, "");
1688       reduce4F(opcode, dst, src, vtmp1);
1689       break;
1690     case 8:
1691       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1692       break;
1693     case 16:
1694       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1695       break;
1696     default: assert(false, "wrong vector length");
1697   }
1698 }
1699 
1700 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1701   switch (vlen) {
1702     case 2:
1703       assert(vtmp2 == xnoreg, "");
1704       reduce2D(opcode, dst, src, vtmp1);
1705       break;
1706     case 4:
1707       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1708       break;
1709     case 8:
1710       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1711       break;
1712     default: assert(false, "wrong vector length");
1713   }
1714 }
1715 
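// Reduce a 2-element int vector: for addition use a horizontal add (phaddd);
// otherwise fold element 1 onto element 0. Then fold in the scalar from src1
// and move the result to dst.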
1716 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1717   if (opcode == Op_AddReductionVI) {
1718     if (vtmp1 != src2) {
1719       movdqu(vtmp1, src2);
1720     }
1721     phaddd(vtmp1, vtmp1);
1722   } else {
1723     pshufd(vtmp1, src2, 0x1);
1724     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1725   }
1726   movdl(vtmp2, src1);
1727   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1728   movdl(dst, vtmp1);
1729 }
1730 
1731 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1732   if (opcode == Op_AddReductionVI) {
1733     if (vtmp1 != src2) {
1734       movdqu(vtmp1, src2);
1735     }
1736     phaddd(vtmp1, src2);
1737     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1738   } else {
1739     pshufd(vtmp2, src2, 0xE);
1740     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1741     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1742   }
1743 }
1744 
1745 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1746   if (opcode == Op_AddReductionVI) {
1747     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1748     vextracti128_high(vtmp2, vtmp1);
1749     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1750     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1751   } else {
1752     vextracti128_high(vtmp1, src2);
1753     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1754     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1755   }
1756 }
1757 
1758 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1759   vextracti64x4_high(vtmp2, src2);
1760   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1761   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1762 }
1763 
1764 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1765   pshufd(vtmp2, src2, 0x1);
1766   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1767   movdqu(vtmp1, vtmp2);
1768   psrldq(vtmp1, 2);
1769   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1770   movdqu(vtmp2, vtmp1);
1771   psrldq(vtmp2, 1);
1772   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1773   movdl(vtmp2, src1);
1774   pmovsxbd(vtmp1, vtmp1);
1775   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1776   pextrb(dst, vtmp1, 0x0);
1777   movsbl(dst, dst);
1778 }
1779 
1780 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1781   pshufd(vtmp1, src2, 0xE);
1782   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1783   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1784 }
1785 
1786 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1787   vextracti128_high(vtmp2, src2);
1788   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1789   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1790 }
1791 
1792 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1793   vextracti64x4_high(vtmp1, src2);
1794   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1795   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1796 }
1797 
1798 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1799   pmovsxbw(vtmp2, src2);
1800   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1801 }
1802 
1803 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1804   if (UseAVX > 1) {
1805     int vector_len = Assembler::AVX_256bit;
1806     vpmovsxbw(vtmp1, src2, vector_len);
1807     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1808   } else {
1809     pmovsxbw(vtmp2, src2);
1810     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1811     pshufd(vtmp2, src2, 0x1);
1812     pmovsxbw(vtmp2, src2);
1813     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1814   }
1815 }
1816 
1817 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1818   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1819     int vector_len = Assembler::AVX_512bit;
1820     vpmovsxbw(vtmp1, src2, vector_len);
1821     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1822   } else {
1823     assert(UseAVX >= 2,"Should not reach here.");
1824     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1825     vextracti128_high(vtmp2, src2);
1826     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1827   }
1828 }
1829 
1830 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1831   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1832   vextracti64x4_high(vtmp2, src2);
1833   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1834 }
1835 
1836 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1837   if (opcode == Op_AddReductionVI) {
1838     if (vtmp1 != src2) {
1839       movdqu(vtmp1, src2);
1840     }
1841     phaddw(vtmp1, vtmp1);
1842     phaddw(vtmp1, vtmp1);
1843   } else {
1844     pshufd(vtmp2, src2, 0x1);
1845     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1846     movdqu(vtmp1, vtmp2);
1847     psrldq(vtmp1, 2);
1848     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1849   }
1850   movdl(vtmp2, src1);
1851   pmovsxwd(vtmp1, vtmp1);
1852   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1853   pextrw(dst, vtmp1, 0x0);
1854   movswl(dst, dst);
1855 }
1856 
1857 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1858   if (opcode == Op_AddReductionVI) {
1859     if (vtmp1 != src2) {
1860       movdqu(vtmp1, src2);
1861     }
1862     phaddw(vtmp1, src2);
1863   } else {
1864     pshufd(vtmp1, src2, 0xE);
1865     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1866   }
1867   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1868 }
1869 
1870 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1871   if (opcode == Op_AddReductionVI) {
1872     int vector_len = Assembler::AVX_256bit;
1873     vphaddw(vtmp2, src2, src2, vector_len);
1874     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1875   } else {
1876     vextracti128_high(vtmp2, src2);
1877     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1878   }
1879   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1880 }
1881 
1882 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1883   int vector_len = Assembler::AVX_256bit;
1884   vextracti64x4_high(vtmp1, src2);
1885   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1886   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1887 }
1888 
1889 #ifdef _LP64
1890 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1891   pshufd(vtmp2, src2, 0xE);
1892   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1893   movdq(vtmp1, src1);
1894   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1895   movdq(dst, vtmp1);
1896 }
1897 
1898 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1899   vextracti128_high(vtmp1, src2);
1900   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1901   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1902 }
1903 
1904 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1905   vextracti64x4_high(vtmp2, src2);
1906   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1907   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1908 }
1909 
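// Build an opmask register with the low 'len' bits set, e.g. len == 5 yields 0b11111:
// start from all ones and let bzhiq clear every bit at position >= len.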
1910 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1911   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1912   mov64(temp, -1L);
1913   bzhiq(temp, temp, len);
1914   kmovql(dst, temp);
1915 }
1916 #endif // _LP64
1917 
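// Floating point reductions accumulate strictly one element at a time with the
// scalar addss/mulss forms (via reduce_operation_128), using pshufd/vextract* to
// bring each successive element into lane 0, which keeps a fixed left-to-right
// combination order.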
1918 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1919   reduce_operation_128(T_FLOAT, opcode, dst, src);
1920   pshufd(vtmp, src, 0x1);
1921   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1922 }
1923 
1924 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1925   reduce2F(opcode, dst, src, vtmp);
1926   pshufd(vtmp, src, 0x2);
1927   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1928   pshufd(vtmp, src, 0x3);
1929   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1930 }
1931 
1932 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1933   reduce4F(opcode, dst, src, vtmp2);
1934   vextractf128_high(vtmp2, src);
1935   reduce4F(opcode, dst, vtmp2, vtmp1);
1936 }
1937 
1938 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1939   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1940   vextracti64x4_high(vtmp1, src);
1941   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1942 }
1943 
1944 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1945   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1946   pshufd(vtmp, src, 0xE);
1947   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1948 }
1949 
1950 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1951   reduce2D(opcode, dst, src, vtmp2);
1952   vextractf128_high(vtmp2, src);
1953   reduce2D(opcode, dst, vtmp2, vtmp1);
1954 }
1955 
1956 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1957   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1958   vextracti64x4_high(vtmp1, src);
1959   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1960 }
1961 
1962 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1963   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1964 }
1965 
1966 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1967   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1968 }
1969 
1970 
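// Min/max reduction for float vectors: fold the upper half onto the lower half on
// every iteration (vextracti64x4/vextracti128 for the 512/256-bit steps, vpermilps
// with immediates 14 and 1 for the in-lane steps) and combine the halves with
// vminmax_fp, leaving the scalar result in lane 0 of the destination.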
1971 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1972                                           XMMRegister dst, XMMRegister src,
1973                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1974                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1975   int permconst[] = {1, 14};
1976   XMMRegister wsrc = src;
1977   XMMRegister wdst = xmm_0;
1978   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1979 
1980   int vlen_enc = Assembler::AVX_128bit;
1981   if (vlen == 16) {
1982     vlen_enc = Assembler::AVX_256bit;
1983   }
1984 
  for (int i = log2(vlen) - 1; i >= 0; i--) {
1986     if (i == 0 && !is_dst_valid) {
1987       wdst = dst;
1988     }
1989     if (i == 3) {
1990       vextracti64x4_high(wtmp, wsrc);
1991     } else if (i == 2) {
1992       vextracti128_high(wtmp, wsrc);
1993     } else { // i = [0,1]
1994       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1995     }
1996     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1997     wsrc = wdst;
1998     vlen_enc = Assembler::AVX_128bit;
1999   }
2000   if (is_dst_valid) {
2001     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2002   }
2003 }
2004 
2005 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2006                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2007                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2008   XMMRegister wsrc = src;
2009   XMMRegister wdst = xmm_0;
2010   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2011   int vlen_enc = Assembler::AVX_128bit;
2012   if (vlen == 8) {
2013     vlen_enc = Assembler::AVX_256bit;
2014   }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2016     if (i == 0 && !is_dst_valid) {
2017       wdst = dst;
2018     }
2019     if (i == 1) {
2020       vextracti128_high(wtmp, wsrc);
2021     } else if (i == 2) {
2022       vextracti64x4_high(wtmp, wsrc);
2023     } else {
2024       assert(i == 0, "%d", i);
2025       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2026     }
2027     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2028     wsrc = wdst;
2029     vlen_enc = Assembler::AVX_128bit;
2030   }
2031   if (is_dst_valid) {
2032     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2033   }
2034 }
2035 
2036 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2037   switch (bt) {
2038     case T_BYTE:  pextrb(dst, src, idx); break;
2039     case T_SHORT: pextrw(dst, src, idx); break;
2040     case T_INT:   pextrd(dst, src, idx); break;
2041     case T_LONG:  pextrq(dst, src, idx); break;
2042 
2043     default:
2044       assert(false,"Should not reach here.");
2045       break;
2046   }
2047 }
2048 
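// Return a register whose low 128-bit lane contains the requested element:
// element indices in lane 0 are served from src directly, higher lanes are
// first extracted into dst.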
2049 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2050   int esize =  type2aelembytes(typ);
2051   int elem_per_lane = 16/esize;
2052   int lane = elemindex / elem_per_lane;
2053   int eindex = elemindex % elem_per_lane;
2054 
2055   if (lane >= 2) {
2056     assert(UseAVX > 2, "required");
2057     vextractf32x4(dst, src, lane & 3);
2058     return dst;
2059   } else if (lane > 0) {
2060     assert(UseAVX > 0, "required");
2061     vextractf128(dst, src, lane);
2062     return dst;
2063   } else {
2064     return src;
2065   }
2066 }
2067 
2068 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2069   int esize =  type2aelembytes(typ);
2070   int elem_per_lane = 16/esize;
2071   int eindex = elemindex % elem_per_lane;
2072   assert(is_integral_type(typ),"required");
2073 
2074   if (eindex == 0) {
2075     if (typ == T_LONG) {
2076       movq(dst, src);
2077     } else {
2078       movdl(dst, src);
2079       if (typ == T_BYTE)
2080         movsbl(dst, dst);
2081       else if (typ == T_SHORT)
2082         movswl(dst, dst);
2083     }
2084   } else {
2085     extract(typ, dst, src, eindex);
2086   }
2087 }
2088 
2089 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2090   int esize =  type2aelembytes(typ);
2091   int elem_per_lane = 16/esize;
2092   int eindex = elemindex % elem_per_lane;
2093   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2094 
2095   if (eindex == 0) {
2096     movq(dst, src);
2097   } else {
2098     if (typ == T_FLOAT) {
2099       if (UseAVX == 0) {
2100         movdqu(dst, src);
2101         pshufps(dst, dst, eindex);
2102       } else {
2103         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2104       }
2105     } else {
2106       if (UseAVX == 0) {
2107         movdqu(dst, src);
2108         psrldq(dst, eindex*esize);
2109       } else {
2110         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2111       }
2112       movq(dst, dst);
2113     }
2114   }
2115   // Zero upper bits
2116   if (typ == T_FLOAT) {
2117     if (UseAVX == 0) {
2118       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2119       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2120       pand(dst, vtmp);
2121     } else {
2122       assert((tmp != noreg), "required.");
2123       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2124     }
2125   }
2126 }
2127 
2128 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2129   switch(typ) {
2130     case T_BYTE:
2131     case T_BOOLEAN:
2132       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2133       break;
2134     case T_SHORT:
2135     case T_CHAR:
2136       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2137       break;
2138     case T_INT:
2139     case T_FLOAT:
2140       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2141       break;
2142     case T_LONG:
2143     case T_DOUBLE:
2144       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2145       break;
2146     default:
2147       assert(false,"Should not reach here.");
2148       break;
2149   }
2150 }
2151 
2152 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2153   switch(typ) {
2154     case T_BOOLEAN:
2155     case T_BYTE:
2156       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2157       break;
2158     case T_CHAR:
2159     case T_SHORT:
2160       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2161       break;
2162     case T_INT:
2163     case T_FLOAT:
2164       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2165       break;
2166     case T_LONG:
2167     case T_DOUBLE:
2168       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2169       break;
2170     default:
2171       assert(false,"Should not reach here.");
2172       break;
2173   }
2174 }
2175 
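// Emulate an unsigned element compare: zero-extend both operands to the next wider
// element type, compare signed at that width (zero-extended values order correctly
// under a signed compare), then pack the result mask back to the original element size.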
2176 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2177                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2178   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2179   switch (typ) {
2180   case T_BYTE:
2181     vpmovzxbw(vtmp1, src1, vlen_enc);
2182     vpmovzxbw(vtmp2, src2, vlen_enc);
2183     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2184     vpacksswb(dst, dst, dst, vlen_enc);
2185     break;
2186   case T_SHORT:
2187     vpmovzxwd(vtmp1, src1, vlen_enc);
2188     vpmovzxwd(vtmp2, src2, vlen_enc);
2189     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2190     vpackssdw(dst, dst, dst, vlen_enc);
2191     break;
2192   case T_INT:
2193     vpmovzxdq(vtmp1, src1, vlen_enc);
2194     vpmovzxdq(vtmp2, src2, vlen_enc);
2195     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2196     vpermilps(dst, dst, 8, vlen_enc);
2197     break;
2198   default:
2199     assert(false, "Should not reach here");
2200   }
2201   if (vlen_in_bytes == 16) {
2202     vpermpd(dst, dst, 0x8, vlen_enc);
2203   }
2204 }
2205 
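// Same widening trick as vpcmpu, but for inputs whose widened form needs two steps:
// compare the zero-extended lower and upper 128-bit halves separately, then pack and
// permute the two partial result masks back into element order.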
2206 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2207                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2208   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2209   switch (typ) {
2210   case T_BYTE:
2211     vpmovzxbw(vtmp1, src1, vlen_enc);
2212     vpmovzxbw(vtmp2, src2, vlen_enc);
2213     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2214     vextracti128(vtmp1, src1, 1);
2215     vextracti128(vtmp2, src2, 1);
2216     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2217     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2218     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2219     vpacksswb(dst, dst, vtmp3, vlen_enc);
2220     vpermpd(dst, dst, 0xd8, vlen_enc);
2221     break;
2222   case T_SHORT:
2223     vpmovzxwd(vtmp1, src1, vlen_enc);
2224     vpmovzxwd(vtmp2, src2, vlen_enc);
2225     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2226     vextracti128(vtmp1, src1, 1);
2227     vextracti128(vtmp2, src2, 1);
2228     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2229     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2230     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2231     vpackssdw(dst, dst, vtmp3, vlen_enc);
2232     vpermpd(dst, dst, 0xd8, vlen_enc);
2233     break;
2234   case T_INT:
2235     vpmovzxdq(vtmp1, src1, vlen_enc);
2236     vpmovzxdq(vtmp2, src2, vlen_enc);
2237     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2238     vpshufd(dst, dst, 8, vlen_enc);
2239     vpermq(dst, dst, 8, vlen_enc);
2240     vextracti128(vtmp1, src1, 1);
2241     vextracti128(vtmp2, src2, 1);
2242     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2243     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2244     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2245     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2246     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2247     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2248     break;
2249   default:
2250     assert(false, "Should not reach here");
2251   }
2252 }
2253 
2254 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2255   switch(typ) {
2256     case T_BYTE:
2257       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2258       break;
2259     case T_SHORT:
2260       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2261       break;
2262     case T_INT:
2263     case T_FLOAT:
2264       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2265       break;
2266     case T_LONG:
2267     case T_DOUBLE:
2268       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2269       break;
2270     default:
2271       assert(false,"Should not reach here.");
2272       break;
2273   }
2274 }
2275 
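// Vector test helper: 4- and 8-byte inputs are first broadcast to fill 128 bits so
// ptest only sees valid lanes; 16- and 32-byte inputs use ptest/vptest directly;
// 64-byte inputs are compared with evpcmpeqb and the resulting opmask is tested
// with ktestql (ne) or kortestql (overflow).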
2276 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2277                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2278   switch(vlen) {
2279     case 4:
2280       assert(vtmp1 != xnoreg, "required.");
2281       // Broadcast lower 32 bits to 128 bits before ptest
2282       pshufd(vtmp1, src1, 0x0);
2283       if (bt == BoolTest::overflow) {
2284         assert(vtmp2 != xnoreg, "required.");
2285         pshufd(vtmp2, src2, 0x0);
2286       } else {
2287         assert(vtmp2 == xnoreg, "required.");
2288         vtmp2 = src2;
2289       }
2290       ptest(vtmp1, vtmp2);
      break;
2292     case 8:
2293       assert(vtmp1 != xnoreg, "required.");
2294       // Broadcast lower 64 bits to 128 bits before ptest
2295       pshufd(vtmp1, src1, 0x4);
2296       if (bt == BoolTest::overflow) {
2297         assert(vtmp2 != xnoreg, "required.");
2298         pshufd(vtmp2, src2, 0x4);
2299       } else {
2300         assert(vtmp2 == xnoreg, "required.");
2301         vtmp2 = src2;
2302       }
2303       ptest(vtmp1, vtmp2);
      break;
2305     case 16:
2306       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2307       ptest(src1, src2);
2308       break;
2309     case 32:
2310       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2311       vptest(src1, src2, Assembler::AVX_256bit);
2312       break;
2313     case 64:
2314       {
2315         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2316         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2317         if (bt == BoolTest::ne) {
2318           ktestql(mask, mask);
2319         } else {
2320           assert(bt == BoolTest::overflow, "required");
2321           kortestql(mask, mask);
2322         }
2323       }
2324       break;
2325     default:
2326       assert(false,"Should not reach here.");
2327       break;
2328   }
2329 }
2330 
2331 //-------------------------------------------------------------------------------------------
2332 
2333 // IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
2335 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2336                                          Register cnt1, Register cnt2,
2337                                          int int_cnt2,  Register result,
2338                                          XMMRegister vec, Register tmp,
2339                                          int ae) {
2340   ShortBranchVerifier sbv(this);
2341   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2342   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2343 
2344   // This method uses the pcmpestri instruction with bound registers
2345   //   inputs:
2346   //     xmm - substring
2347   //     rax - substring length (elements count)
2348   //     mem - scanned string
2349   //     rdx - string length (elements count)
2350   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2351   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2352   //   outputs:
2353   //     rcx - matched index in string
2354   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2355   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2356   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2357   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2358   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2359 
2360   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2361         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2362         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2363 
2364   // Note, inline_string_indexOf() generates checks:
2365   // if (substr.count > string.count) return -1;
2366   // if (substr.count == 0) return 0;
2367   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2368 
2369   // Load substring.
2370   if (ae == StrIntrinsicNode::UL) {
2371     pmovzxbw(vec, Address(str2, 0));
2372   } else {
2373     movdqu(vec, Address(str2, 0));
2374   }
2375   movl(cnt2, int_cnt2);
2376   movptr(result, str1); // string addr
2377 
2378   if (int_cnt2 > stride) {
2379     jmpb(SCAN_TO_SUBSTR);
2380 
    // Reload substr for rescan; this code
2382     // is executed only for large substrings (> 8 chars)
2383     bind(RELOAD_SUBSTR);
2384     if (ae == StrIntrinsicNode::UL) {
2385       pmovzxbw(vec, Address(str2, 0));
2386     } else {
2387       movdqu(vec, Address(str2, 0));
2388     }
2389     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2390 
2391     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2395 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2398     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2399     subl(cnt1, cnt2);
2400     addl(cnt1, int_cnt2);
2401     movl(cnt2, int_cnt2); // Now restore cnt2
2402 
2403     decrementl(cnt1);     // Shift to next element
2404     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2406 
2407     addptr(result, (1<<scale1));
2408 
2409   } // (int_cnt2 > 8)
2410 
2411   // Scan string for start of substr in 16-byte vectors
2412   bind(SCAN_TO_SUBSTR);
2413   pcmpestri(vec, Address(result, 0), mode);
2414   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2415   subl(cnt1, stride);
2416   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2417   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2419   addptr(result, 16);
2420   jmpb(SCAN_TO_SUBSTR);
2421 
2422   // Found a potential substr
2423   bind(FOUND_CANDIDATE);
2424   // Matched whole vector if first element matched (tmp(rcx) == 0).
2425   if (int_cnt2 == stride) {
2426     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2427   } else { // int_cnt2 > 8
2428     jccb(Assembler::overflow, FOUND_SUBSTR);
2429   }
2430   // After pcmpestri tmp(rcx) contains matched element index
2431   // Compute start addr of substr
2432   lea(result, Address(result, tmp, scale1));
2433 
2434   // Make sure string is still long enough
2435   subl(cnt1, tmp);
2436   cmpl(cnt1, cnt2);
2437   if (int_cnt2 == stride) {
2438     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2439   } else { // int_cnt2 > 8
2440     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2441   }
  // Left less than substring.
2443 
2444   bind(RET_NOT_FOUND);
2445   movl(result, -1);
2446   jmp(EXIT);
2447 
2448   if (int_cnt2 > stride) {
    // This code is optimized for the case where the whole substring
    // matches once its head has matched.
2451     bind(MATCH_SUBSTR_HEAD);
2452     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2454     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2455 
2456     Label CONT_SCAN_SUBSTR;
2457     // Compare the rest of substring (> 8 chars).
2458     bind(FOUND_SUBSTR);
2459     // First 8 chars are already matched.
2460     negptr(cnt2);
2461     addptr(cnt2, stride);
2462 
2463     bind(SCAN_SUBSTR);
2464     subl(cnt1, stride);
2465     cmpl(cnt2, -stride); // Do not read beyond substring
2466     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2467     // Back-up strings to avoid reading beyond substring:
2468     // cnt1 = cnt1 - cnt2 + 8
2469     addl(cnt1, cnt2); // cnt2 is negative
2470     addl(cnt1, stride);
2471     movl(cnt2, stride); negptr(cnt2);
2472     bind(CONT_SCAN_SUBSTR);
2473     if (int_cnt2 < (int)G) {
2474       int tail_off1 = int_cnt2<<scale1;
2475       int tail_off2 = int_cnt2<<scale2;
2476       if (ae == StrIntrinsicNode::UL) {
2477         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2478       } else {
2479         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2480       }
2481       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2482     } else {
2483       // calculate index in register to avoid integer overflow (int_cnt2*2)
2484       movl(tmp, int_cnt2);
2485       addptr(tmp, cnt2);
2486       if (ae == StrIntrinsicNode::UL) {
2487         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2488       } else {
2489         movdqu(vec, Address(str2, tmp, scale2, 0));
2490       }
2491       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2492     }
    // Need to reload string pointers if the whole vector did not match
2494     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2495     addptr(cnt2, stride);
2496     jcc(Assembler::negative, SCAN_SUBSTR);
2497     // Fall through if found full substring
2498 
2499   } // (int_cnt2 > 8)
2500 
2501   bind(RET_FOUND);
2502   // Found result if we matched full small substring.
2503   // Compute substr offset
2504   subptr(result, str1);
2505   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2506     shrl(result, 1); // index
2507   }
2508   bind(EXIT);
2509 
2510 } // string_indexofC8
2511 
// Small strings are loaded through the stack if they cross a page boundary.
2513 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2514                                        Register cnt1, Register cnt2,
2515                                        int int_cnt2,  Register result,
2516                                        XMMRegister vec, Register tmp,
2517                                        int ae) {
2518   ShortBranchVerifier sbv(this);
2519   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2520   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2521 
2522   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
2526   //
2527   // Note, inline_string_indexOf() generates checks:
2528   // if (substr.count > string.count) return -1;
2529   // if (substr.count == 0) return 0;
2530   //
2531   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2532   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2533   // This method uses the pcmpestri instruction with bound registers
2534   //   inputs:
2535   //     xmm - substring
2536   //     rax - substring length (elements count)
2537   //     mem - scanned string
2538   //     rdx - string length (elements count)
2539   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2540   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2541   //   outputs:
2542   //     rcx - matched index in string
2543   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2544   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2545   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2546   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2547 
2548   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2549         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2550         FOUND_CANDIDATE;
2551 
2552   { //========================================================
2553     // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
2555     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2556 
2557     movptr(tmp, rsp); // save old SP
2558 
2559     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2560       if (int_cnt2 == (1>>scale2)) { // One byte
2561         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2562         load_unsigned_byte(result, Address(str2, 0));
2563         movdl(vec, result); // move 32 bits
2564       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2565         // Not enough header space in 32-bit VM: 12+3 = 15.
2566         movl(result, Address(str2, -1));
2567         shrl(result, 8);
2568         movdl(vec, result); // move 32 bits
2569       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2570         load_unsigned_short(result, Address(str2, 0));
2571         movdl(vec, result); // move 32 bits
2572       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2573         movdl(vec, Address(str2, 0)); // move 32 bits
2574       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2575         movq(vec, Address(str2, 0));  // move 64 bits
2576       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2577         // Array header size is 12 bytes in 32-bit VM
2578         // + 6 bytes for 3 chars == 18 bytes,
2579         // enough space to load vec and shift.
2580         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2581         if (ae == StrIntrinsicNode::UL) {
2582           int tail_off = int_cnt2-8;
2583           pmovzxbw(vec, Address(str2, tail_off));
2584           psrldq(vec, -2*tail_off);
2585         }
2586         else {
2587           int tail_off = int_cnt2*(1<<scale2);
2588           movdqu(vec, Address(str2, tail_off-16));
2589           psrldq(vec, 16-tail_off);
2590         }
2591       }
2592     } else { // not constant substring
2593       cmpl(cnt2, stride);
2594       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2595 
      // We can read beyond the string if str+16 does not cross a page boundary,
      // since heaps are aligned and mapped by pages.
2598       assert(os::vm_page_size() < (int)G, "default page should be small");
2599       movl(result, str2); // We need only low 32 bits
2600       andl(result, (os::vm_page_size()-1));
2601       cmpl(result, (os::vm_page_size()-16));
2602       jccb(Assembler::belowEqual, CHECK_STR);
2603 
      // Move small strings to the stack to allow loading 16 bytes into vec.
2605       subptr(rsp, 16);
2606       int stk_offset = wordSize-(1<<scale2);
2607       push(cnt2);
2608 
2609       bind(COPY_SUBSTR);
2610       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2611         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2612         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2613       } else if (ae == StrIntrinsicNode::UU) {
2614         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2615         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2616       }
2617       decrement(cnt2);
2618       jccb(Assembler::notZero, COPY_SUBSTR);
2619 
2620       pop(cnt2);
2621       movptr(str2, rsp);  // New substring address
2622     } // non constant
2623 
2624     bind(CHECK_STR);
2625     cmpl(cnt1, stride);
2626     jccb(Assembler::aboveEqual, BIG_STRINGS);
2627 
2628     // Check cross page boundary.
2629     movl(result, str1); // We need only low 32 bits
2630     andl(result, (os::vm_page_size()-1));
2631     cmpl(result, (os::vm_page_size()-16));
2632     jccb(Assembler::belowEqual, BIG_STRINGS);
2633 
2634     subptr(rsp, 16);
2635     int stk_offset = -(1<<scale1);
2636     if (int_cnt2 < 0) { // not constant
2637       push(cnt2);
2638       stk_offset += wordSize;
2639     }
2640     movl(cnt2, cnt1);
2641 
2642     bind(COPY_STR);
2643     if (ae == StrIntrinsicNode::LL) {
2644       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2645       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2646     } else {
2647       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2648       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2649     }
2650     decrement(cnt2);
2651     jccb(Assembler::notZero, COPY_STR);
2652 
2653     if (int_cnt2 < 0) { // not constant
2654       pop(cnt2);
2655     }
2656     movptr(str1, rsp);  // New string address
2657 
2658     bind(BIG_STRINGS);
2659     // Load substring.
2660     if (int_cnt2 < 0) { // -1
2661       if (ae == StrIntrinsicNode::UL) {
2662         pmovzxbw(vec, Address(str2, 0));
2663       } else {
2664         movdqu(vec, Address(str2, 0));
2665       }
2666       push(cnt2);       // substr count
2667       push(str2);       // substr addr
2668       push(str1);       // string addr
2669     } else {
2670       // Small (< 8 chars) constant substrings are loaded already.
2671       movl(cnt2, int_cnt2);
2672     }
2673     push(tmp);  // original SP
2674 
2675   } // Finished loading
2676 
2677   //========================================================
2678   // Start search
2679   //
2680 
2681   movptr(result, str1); // string addr
2682 
2683   if (int_cnt2  < 0) {  // Only for non constant substring
2684     jmpb(SCAN_TO_SUBSTR);
2685 
2686     // SP saved at sp+0
2687     // String saved at sp+1*wordSize
2688     // Substr saved at sp+2*wordSize
2689     // Substr count saved at sp+3*wordSize
2690 
2691     // Reload substr for rescan; this code
2692     // is executed only for large substrings (> 8 chars).
2693     bind(RELOAD_SUBSTR);
2694     movptr(str2, Address(rsp, 2*wordSize));
2695     movl(cnt2, Address(rsp, 3*wordSize));
2696     if (ae == StrIntrinsicNode::UL) {
2697       pmovzxbw(vec, Address(str2, 0));
2698     } else {
2699       movdqu(vec, Address(str2, 0));
2700     }
2701     // We came here after the beginning of the substring was
2702     // matched but the rest of it was not, so we need to search
2703     // again. Start from the next element after the previous match.
2704     subptr(str1, result); // Restore counter
2705     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2706       shrl(str1, 1);
2707     }
2708     addl(cnt1, str1);
2709     decrementl(cnt1);   // Shift to next element
2710     cmpl(cnt1, cnt2);
2711     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2712 
2713     addptr(result, (1<<scale1));
2714   } // non constant
2715 
2716   // Scan string for start of substr in 16-byte vectors
2717   bind(SCAN_TO_SUBSTR);
2718   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2719   pcmpestri(vec, Address(result, 0), mode);
2720   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2721   subl(cnt1, stride);
2722   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2723   cmpl(cnt1, cnt2);
2724   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2725   addptr(result, 16);
2726 
2727   bind(ADJUST_STR);
2728   cmpl(cnt1, stride); // Do not read beyond string
2729   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2730   // Back-up string to avoid reading beyond string.
2731   lea(result, Address(result, cnt1, scale1, -16));
2732   movl(cnt1, stride);
2733   jmpb(SCAN_TO_SUBSTR);
2734 
2735   // Found a potential substr
2736   bind(FOUND_CANDIDATE);
2737   // After pcmpestri, tmp(rcx) contains the matched element index
2738 
2739   // Make sure string is still long enough
2740   subl(cnt1, tmp);
2741   cmpl(cnt1, cnt2);
2742   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2743   // Left less than substring.
2744 
2745   bind(RET_NOT_FOUND);
2746   movl(result, -1);
2747   jmp(CLEANUP);
2748 
2749   bind(FOUND_SUBSTR);
2750   // Compute start addr of substr
2751   lea(result, Address(result, tmp, scale1));
2752   if (int_cnt2 > 0) { // Constant substring
2753     // Repeat search for small substring (< 8 chars)
2754     // from new point without reloading substring.
2755     // Have to check that we don't read beyond string.
2756     cmpl(tmp, stride-int_cnt2);
2757     jccb(Assembler::greater, ADJUST_STR);
2758     // Fall through if matched whole substring.
2759   } else { // non constant
2760     assert(int_cnt2 == -1, "should be -1");
2761 
2762     addl(tmp, cnt2);
2763     // Found result if we matched whole substring.
2764     cmpl(tmp, stride);
2765     jcc(Assembler::lessEqual, RET_FOUND);
2766 
2767     // Repeat search for small substring (<= 8 chars)
2768     // from new point 'str1' without reloading substring.
2769     cmpl(cnt2, stride);
2770     // Have to check that we don't read beyond string.
2771     jccb(Assembler::lessEqual, ADJUST_STR);
2772 
2773     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2774     // Compare the rest of substring (> 8 chars).
2775     movptr(str1, result);
2776 
2777     cmpl(tmp, cnt2);
2778     // First 8 chars are already matched.
2779     jccb(Assembler::equal, CHECK_NEXT);
2780 
2781     bind(SCAN_SUBSTR);
2782     pcmpestri(vec, Address(str1, 0), mode);
2783     // Need to reload string pointers if the whole vector did not match
2784     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2785 
2786     bind(CHECK_NEXT);
2787     subl(cnt2, stride);
2788     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2789     addptr(str1, 16);
2790     if (ae == StrIntrinsicNode::UL) {
2791       addptr(str2, 8);
2792     } else {
2793       addptr(str2, 16);
2794     }
2795     subl(cnt1, stride);
2796     cmpl(cnt2, stride); // Do not read beyond substring
2797     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2798     // Back-up strings to avoid reading beyond substring.
2799 
2800     if (ae == StrIntrinsicNode::UL) {
2801       lea(str2, Address(str2, cnt2, scale2, -8));
2802       lea(str1, Address(str1, cnt2, scale1, -16));
2803     } else {
2804       lea(str2, Address(str2, cnt2, scale2, -16));
2805       lea(str1, Address(str1, cnt2, scale1, -16));
2806     }
2807     subl(cnt1, cnt2);
2808     movl(cnt2, stride);
2809     addl(cnt1, stride);
2810     bind(CONT_SCAN_SUBSTR);
2811     if (ae == StrIntrinsicNode::UL) {
2812       pmovzxbw(vec, Address(str2, 0));
2813     } else {
2814       movdqu(vec, Address(str2, 0));
2815     }
2816     jmp(SCAN_SUBSTR);
2817 
2818     bind(RET_FOUND_LONG);
2819     movptr(str1, Address(rsp, wordSize));
2820   } // non constant
2821 
2822   bind(RET_FOUND);
2823   // Compute substr offset
2824   subptr(result, str1);
2825   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2826     shrl(result, 1); // index
2827   }
2828   bind(CLEANUP);
2829   pop(rsp); // restore SP
2830 
2831 } // string_indexof
2832 
2833 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2834                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2835   ShortBranchVerifier sbv(this);
2836   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2837 
2838   int stride = 8;
2839 
2840   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2841         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2842         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2843         FOUND_SEQ_CHAR, DONE_LABEL;
2844 
2845   movptr(result, str1);
2846   if (UseAVX >= 2) {
2847     cmpl(cnt1, stride);
2848     jcc(Assembler::less, SCAN_TO_CHAR);
2849     cmpl(cnt1, 2*stride);
2850     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2851     movdl(vec1, ch);
2852     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2853     vpxor(vec2, vec2);
2854     movl(tmp, cnt1);
2855     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2856     andl(cnt1,0x0000000F);  //tail count (in chars)
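         // Illustrative example: cnt1 == 37 leaves tmp == 32 chars for the 16-char
         // vector loop and cnt1 == 5 chars for the tail.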
2857 
2858     bind(SCAN_TO_16_CHAR_LOOP);
2859     vmovdqu(vec3, Address(result, 0));
2860     vpcmpeqw(vec3, vec3, vec1, Assembler::AVX_256bit);
2861     vptest(vec2, vec3);
2862     jcc(Assembler::carryClear, FOUND_CHAR);
2863     addptr(result, 32);
2864     subl(tmp, 2*stride);
2865     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2866     jmp(SCAN_TO_8_CHAR);
2867     bind(SCAN_TO_8_CHAR_INIT);
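         // Broadcast the search char to every word lane: pshuflw(0x00) replicates word 0
         // across the low 4 words and pshufd(0) copies that dword across the register.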
2868     movdl(vec1, ch);
2869     pshuflw(vec1, vec1, 0x00);
2870     pshufd(vec1, vec1, 0);
2871     pxor(vec2, vec2);
2872   }
2873   bind(SCAN_TO_8_CHAR);
2874   cmpl(cnt1, stride);
2875   jcc(Assembler::less, SCAN_TO_CHAR);
2876   if (UseAVX < 2) {
2877     movdl(vec1, ch);
2878     pshuflw(vec1, vec1, 0x00);
2879     pshufd(vec1, vec1, 0);
2880     pxor(vec2, vec2);
2881   }
2882   movl(tmp, cnt1);
2883   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2884   andl(cnt1,0x00000007);  //tail count (in chars)
2885 
2886   bind(SCAN_TO_8_CHAR_LOOP);
2887   movdqu(vec3, Address(result, 0));
2888   pcmpeqw(vec3, vec1);
2889   ptest(vec2, vec3);
2890   jcc(Assembler::carryClear, FOUND_CHAR);
2891   addptr(result, 16);
2892   subl(tmp, stride);
2893   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2894   bind(SCAN_TO_CHAR);
2895   testl(cnt1, cnt1);
2896   jcc(Assembler::zero, RET_NOT_FOUND);
2897   bind(SCAN_TO_CHAR_LOOP);
2898   load_unsigned_short(tmp, Address(result, 0));
2899   cmpl(ch, tmp);
2900   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2901   addptr(result, 2);
2902   subl(cnt1, 1);
2903   jccb(Assembler::zero, RET_NOT_FOUND);
2904   jmp(SCAN_TO_CHAR_LOOP);
2905 
2906   bind(RET_NOT_FOUND);
2907   movl(result, -1);
2908   jmpb(DONE_LABEL);
2909 
2910   bind(FOUND_CHAR);
2911   if (UseAVX >= 2) {
2912     vpmovmskb(tmp, vec3);
2913   } else {
2914     pmovmskb(tmp, vec3);
2915   }
2916   bsfl(ch, tmp);
2917   addptr(result, ch);
2918 
2919   bind(FOUND_SEQ_CHAR);
2920   subptr(result, str1);
2921   shrl(result, 1);
2922 
2923   bind(DONE_LABEL);
2924 } // string_indexof_char
2925 
2926 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2927                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2928   ShortBranchVerifier sbv(this);
2929   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2930 
2931   int stride = 16;
2932 
2933   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2934         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2935         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2936         FOUND_SEQ_CHAR, DONE_LABEL;
2937 
2938   movptr(result, str1);
2939   if (UseAVX >= 2) {
2940     cmpl(cnt1, stride);
2941     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2942     cmpl(cnt1, stride*2);
2943     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2944     movdl(vec1, ch);
2945     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2946     vpxor(vec2, vec2);
2947     movl(tmp, cnt1);
2948     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2949     andl(cnt1,0x0000001F);  //tail count (in chars)
2950 
2951     bind(SCAN_TO_32_CHAR_LOOP);
2952     vmovdqu(vec3, Address(result, 0));
2953     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2954     vptest(vec2, vec3);
2955     jcc(Assembler::carryClear, FOUND_CHAR);
2956     addptr(result, 32);
2957     subl(tmp, stride*2);
2958     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2959     jmp(SCAN_TO_16_CHAR);
2960 
2961     bind(SCAN_TO_16_CHAR_INIT);
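         // Broadcast the search byte to every lane: pshufb with an all-zero control
         // replicates byte 0 of vec1 (the search char) across all 16 byte lanes.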
2962     movdl(vec1, ch);
2963     pxor(vec2, vec2);
2964     pshufb(vec1, vec2);
2965   }
2966 
2967   bind(SCAN_TO_16_CHAR);
2968   cmpl(cnt1, stride);
2969   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2970   if (UseAVX < 2) {
2971     movdl(vec1, ch);
2972     pxor(vec2, vec2);
2973     pshufb(vec1, vec2);
2974   }
2975   movl(tmp, cnt1);
2976   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
2977   andl(cnt1,0x0000000F);  //tail count (in bytes)
2978 
2979   bind(SCAN_TO_16_CHAR_LOOP);
2980   movdqu(vec3, Address(result, 0));
2981   pcmpeqb(vec3, vec1);
2982   ptest(vec2, vec3);
2983   jcc(Assembler::carryClear, FOUND_CHAR);
2984   addptr(result, 16);
2985   subl(tmp, stride);
2986   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
2987 
2988   bind(SCAN_TO_CHAR_INIT);
2989   testl(cnt1, cnt1);
2990   jcc(Assembler::zero, RET_NOT_FOUND);
2991   bind(SCAN_TO_CHAR_LOOP);
2992   load_unsigned_byte(tmp, Address(result, 0));
2993   cmpl(ch, tmp);
2994   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2995   addptr(result, 1);
2996   subl(cnt1, 1);
2997   jccb(Assembler::zero, RET_NOT_FOUND);
2998   jmp(SCAN_TO_CHAR_LOOP);
2999 
3000   bind(RET_NOT_FOUND);
3001   movl(result, -1);
3002   jmpb(DONE_LABEL);
3003 
3004   bind(FOUND_CHAR);
3005   if (UseAVX >= 2) {
3006     vpmovmskb(tmp, vec3);
3007   } else {
3008     pmovmskb(tmp, vec3);
3009   }
3010   bsfl(ch, tmp);
3011   addptr(result, ch);
3012 
3013   bind(FOUND_SEQ_CHAR);
3014   subptr(result, str1);
3015 
3016   bind(DONE_LABEL);
3017 } // stringL_indexof_char
3018 
3019 // helper function for string_compare
3020 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3021                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3022                                            Address::ScaleFactor scale2, Register index, int ae) {
3023   if (ae == StrIntrinsicNode::LL) {
3024     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3025     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3026   } else if (ae == StrIntrinsicNode::UU) {
3027     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3028     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3029   } else {
3030     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3031     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3032   }
3033 }
3034 
3035 // Compare strings, used for char[] and byte[].
3036 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3037                                        Register cnt1, Register cnt2, Register result,
3038                                        XMMRegister vec1, int ae, KRegister mask) {
3039   ShortBranchVerifier sbv(this);
3040   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3041   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3042   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3043   int stride2x2 = 0x40;
3044   Address::ScaleFactor scale = Address::no_scale;
3045   Address::ScaleFactor scale1 = Address::no_scale;
3046   Address::ScaleFactor scale2 = Address::no_scale;
3047 
3048   if (ae != StrIntrinsicNode::LL) {
3049     stride2x2 = 0x20;
3050   }
3051 
3052   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3053     shrl(cnt2, 1);
3054   }
3055   // Compute the minimum of the string lengths and the
3056   // difference of the string lengths (saved on the stack),
3057   // selecting the minimum with a conditional move.
3058   movl(result, cnt1);
3059   subl(cnt1, cnt2);
3060   push(cnt1);
3061   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
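       // Illustrative example: cnt1 == 3, cnt2 == 7 gives result == 3, pushes the
       // difference -4, and the lessEqual cmov sets cnt2 = 3 = min(cnt1, cnt2).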
3062 
3063   // Is the minimum length zero?
3064   testl(cnt2, cnt2);
3065   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3066   if (ae == StrIntrinsicNode::LL) {
3067     // Load first bytes
3068     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3069     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3070   } else if (ae == StrIntrinsicNode::UU) {
3071     // Load first characters
3072     load_unsigned_short(result, Address(str1, 0));
3073     load_unsigned_short(cnt1, Address(str2, 0));
3074   } else {
3075     load_unsigned_byte(result, Address(str1, 0));
3076     load_unsigned_short(cnt1, Address(str2, 0));
3077   }
3078   subl(result, cnt1);
3079   jcc(Assembler::notZero,  POP_LABEL);
3080 
3081   if (ae == StrIntrinsicNode::UU) {
3082     // Divide length by 2 to get number of chars
3083     shrl(cnt2, 1);
3084   }
3085   cmpl(cnt2, 1);
3086   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3087 
3088   // Check if the strings start at the same location and setup scale and stride
3089   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3090     cmpptr(str1, str2);
3091     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3092     if (ae == StrIntrinsicNode::LL) {
3093       scale = Address::times_1;
3094       stride = 16;
3095     } else {
3096       scale = Address::times_2;
3097       stride = 8;
3098     }
3099   } else {
3100     scale1 = Address::times_1;
3101     scale2 = Address::times_2;
3102     // scale not used
3103     stride = 8;
3104   }
3105 
3106   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3107     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3108     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3109     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3110     Label COMPARE_TAIL_LONG;
3111     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3112 
3113     int pcmpmask = 0x19;
3114     if (ae == StrIntrinsicNode::LL) {
3115       pcmpmask &= ~0x01;
3116     }
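         // 0x19 == 0b011001: bits [1:0] = 01 (unsigned words), bits [3:2] = 10 (equal
         // each), bits [5:4] = 01 (negated result); clearing bit 0 for LL selects
         // unsigned bytes instead.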
3117 
3118     // Set up to compare 16-char (32-byte) vectors,
3119     // starting from the first character again because it has an aligned address.
3120     if (ae == StrIntrinsicNode::LL) {
3121       stride2 = 32;
3122     } else {
3123       stride2 = 16;
3124     }
3125     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3126       adr_stride = stride << scale;
3127     } else {
3128       adr_stride1 = 8;  //stride << scale1;
3129       adr_stride2 = 16; //stride << scale2;
3130     }
3131 
3132     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3133     // rax and rdx are used by pcmpestri as element counters
3134     movl(result, cnt2);
3135     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3136     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3137 
3138     // Fast path: compare the first two 8-char vectors.
3139     bind(COMPARE_16_CHARS);
3140     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3141       movdqu(vec1, Address(str1, 0));
3142     } else {
3143       pmovzxbw(vec1, Address(str1, 0));
3144     }
3145     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3146     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3147 
3148     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3149       movdqu(vec1, Address(str1, adr_stride));
3150       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3151     } else {
3152       pmovzxbw(vec1, Address(str1, adr_stride1));
3153       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3154     }
3155     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3156     addl(cnt1, stride);
3157 
3158     // Compare the characters at index in cnt1
3159     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3160     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3161     subl(result, cnt2);
3162     jmp(POP_LABEL);
3163 
3164     // Setup the registers to start vector comparison loop
3165     bind(COMPARE_WIDE_VECTORS);
3166     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3167       lea(str1, Address(str1, result, scale));
3168       lea(str2, Address(str2, result, scale));
3169     } else {
3170       lea(str1, Address(str1, result, scale1));
3171       lea(str2, Address(str2, result, scale2));
3172     }
3173     subl(result, stride2);
3174     subl(cnt2, stride2);
3175     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3176     negptr(result);
3177 
3178     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3179     bind(COMPARE_WIDE_VECTORS_LOOP);
3180 
3181 #ifdef _LP64
3182     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3183       cmpl(cnt2, stride2x2);
3184       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3185       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3186       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3187 
3188       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3189       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3190         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3191         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3192       } else {
3193         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3194         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3195       }
3196       kortestql(mask, mask);
3197       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3198       addptr(result, stride2x2);  // update since we already compared at this addr
3199       subl(cnt2, stride2x2);      // and sub the size too
3200       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3201 
3202       vpxor(vec1, vec1);
3203       jmpb(COMPARE_WIDE_TAIL);
3204     }//if (VM_Version::supports_avx512vlbw())
3205 #endif // _LP64
3206 
3207 
3208     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3209     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3210       vmovdqu(vec1, Address(str1, result, scale));
3211       vpxor(vec1, Address(str2, result, scale));
3212     } else {
3213       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3214       vpxor(vec1, Address(str2, result, scale2));
3215     }
3216     vptest(vec1, vec1);
3217     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3218     addptr(result, stride2);
3219     subl(cnt2, stride2);
3220     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3221     // clean upper bits of YMM registers
3222     vpxor(vec1, vec1);
3223 
3224     // compare wide vectors tail
3225     bind(COMPARE_WIDE_TAIL);
3226     testptr(result, result);
3227     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3228 
3229     movl(result, stride2);
3230     movl(cnt2, result);
3231     negptr(result);
3232     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3233 
3234     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3235     bind(VECTOR_NOT_EQUAL);
3236     // clean upper bits of YMM registers
3237     vpxor(vec1, vec1);
3238     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3239       lea(str1, Address(str1, result, scale));
3240       lea(str2, Address(str2, result, scale));
3241     } else {
3242       lea(str1, Address(str1, result, scale1));
3243       lea(str2, Address(str2, result, scale2));
3244     }
3245     jmp(COMPARE_16_CHARS);
3246 
3247     // Compare tail chars, length between 1 and 15 chars
3248     bind(COMPARE_TAIL_LONG);
3249     movl(cnt2, result);
3250     cmpl(cnt2, stride);
3251     jcc(Assembler::less, COMPARE_SMALL_STR);
3252 
3253     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3254       movdqu(vec1, Address(str1, 0));
3255     } else {
3256       pmovzxbw(vec1, Address(str1, 0));
3257     }
3258     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3259     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3260     subptr(cnt2, stride);
3261     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3262     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3263       lea(str1, Address(str1, result, scale));
3264       lea(str2, Address(str2, result, scale));
3265     } else {
3266       lea(str1, Address(str1, result, scale1));
3267       lea(str2, Address(str2, result, scale2));
3268     }
3269     negptr(cnt2);
3270     jmpb(WHILE_HEAD_LABEL);
3271 
3272     bind(COMPARE_SMALL_STR);
3273   } else if (UseSSE42Intrinsics) {
3274     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3275     int pcmpmask = 0x19;
3276     // Set up to compare 8-char (16-byte) vectors,
3277     // starting from the first character again because it has an aligned address.
3278     movl(result, cnt2);
3279     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3280     if (ae == StrIntrinsicNode::LL) {
3281       pcmpmask &= ~0x01;
3282     }
3283     jcc(Assembler::zero, COMPARE_TAIL);
3284     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3285       lea(str1, Address(str1, result, scale));
3286       lea(str2, Address(str2, result, scale));
3287     } else {
3288       lea(str1, Address(str1, result, scale1));
3289       lea(str2, Address(str2, result, scale2));
3290     }
3291     negptr(result);
3292 
3293     // pcmpestri
3294     //   inputs:
3295     //     vec1- substring
3296     //     rax - negative string length (elements count)
3297     //     mem - scanned string
3298     //     rdx - string length (elements count)
3299     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3300     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3301     //   outputs:
3302     //     rcx - first mismatched element index
3303     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3304 
3305     bind(COMPARE_WIDE_VECTORS);
3306     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3307       movdqu(vec1, Address(str1, result, scale));
3308       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3309     } else {
3310       pmovzxbw(vec1, Address(str1, result, scale1));
3311       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3312     }
3313     // After pcmpestri, cnt1(rcx) contains the mismatched element index
3314 
3315     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3316     addptr(result, stride);
3317     subptr(cnt2, stride);
3318     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3319 
3320     // compare wide vectors tail
3321     testptr(result, result);
3322     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3323 
3324     movl(cnt2, stride);
3325     movl(result, stride);
3326     negptr(result);
3327     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3328       movdqu(vec1, Address(str1, result, scale));
3329       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3330     } else {
3331       pmovzxbw(vec1, Address(str1, result, scale1));
3332       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3333     }
3334     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3335 
3336     // Mismatched characters in the vectors
3337     bind(VECTOR_NOT_EQUAL);
3338     addptr(cnt1, result);
3339     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3340     subl(result, cnt2);
3341     jmpb(POP_LABEL);
3342 
3343     bind(COMPARE_TAIL); // limit is zero
3344     movl(cnt2, result);
3345     // Fallthru to tail compare
3346   }
3347   // Shift str2 and str1 to the end of the arrays, negate min
3348   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3349     lea(str1, Address(str1, cnt2, scale));
3350     lea(str2, Address(str2, cnt2, scale));
3351   } else {
3352     lea(str1, Address(str1, cnt2, scale1));
3353     lea(str2, Address(str2, cnt2, scale2));
3354   }
3355   decrementl(cnt2);  // first character was compared already
3356   negptr(cnt2);
3357 
3358   // Compare the rest of the elements
3359   bind(WHILE_HEAD_LABEL);
3360   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3361   subl(result, cnt1);
3362   jccb(Assembler::notZero, POP_LABEL);
3363   increment(cnt2);
3364   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3365 
3366   // Strings are equal up to min length.  Return the length difference.
3367   bind(LENGTH_DIFF_LABEL);
3368   pop(result);
3369   if (ae == StrIntrinsicNode::UU) {
3370     // Divide diff by 2 to get number of chars
3371     sarl(result, 1);
3372   }
3373   jmpb(DONE_LABEL);
3374 
3375 #ifdef _LP64
3376   if (VM_Version::supports_avx512vlbw()) {
3377 
3378     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3379 
3380     kmovql(cnt1, mask);
3381     notq(cnt1);
3382     bsfq(cnt2, cnt1);
3383     if (ae != StrIntrinsicNode::LL) {
3384       // Divide diff by 2 to get number of chars
3385       sarl(cnt2, 1);
3386     }
3387     addq(result, cnt2);
3388     if (ae == StrIntrinsicNode::LL) {
3389       load_unsigned_byte(cnt1, Address(str2, result));
3390       load_unsigned_byte(result, Address(str1, result));
3391     } else if (ae == StrIntrinsicNode::UU) {
3392       load_unsigned_short(cnt1, Address(str2, result, scale));
3393       load_unsigned_short(result, Address(str1, result, scale));
3394     } else {
3395       load_unsigned_short(cnt1, Address(str2, result, scale2));
3396       load_unsigned_byte(result, Address(str1, result, scale1));
3397     }
3398     subl(result, cnt1);
3399     jmpb(POP_LABEL);
3400   }//if (VM_Version::supports_avx512vlbw())
3401 #endif // _LP64
3402 
3403   // Discard the stored length difference
3404   bind(POP_LABEL);
3405   pop(cnt1);
3406 
3407   // That's it
3408   bind(DONE_LABEL);
3409   if (ae == StrIntrinsicNode::UL) {
3410     negl(result);
3411   }
3412 
3413 }
3414 
3415 // Search for a non-ASCII character (negative byte value) in a byte array,
3416 // returning true if one is found and false otherwise.
3417 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3418 //   @IntrinsicCandidate
3419 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3420 //     for (int i = off; i < off + len; i++) {
3421 //       if (ba[i] < 0) {
3422 //         return true;
3423 //       }
3424 //     }
3425 //     return false;
3426 //   }
3427 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3428   Register result, Register tmp1,
3429   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3430   // rsi: byte array
3431   // rcx: len
3432   // rax: result
3433   ShortBranchVerifier sbv(this);
3434   assert_different_registers(ary1, len, result, tmp1);
3435   assert_different_registers(vec1, vec2);
3436   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3437 
3438   // len == 0
3439   testl(len, len);
3440   jcc(Assembler::zero, FALSE_LABEL);
3441 
3442   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3443     VM_Version::supports_avx512vlbw() &&
3444     VM_Version::supports_bmi2()) {
3445 
3446     Label test_64_loop, test_tail;
3447     Register tmp3_aliased = len;
3448 
3449     movl(tmp1, len);
3450     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3451 
3452     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3453     andl(len, ~(64 - 1));    // vector count (in chars)
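         // Illustrative example: len == 200 leaves len == 192 bytes for the 64-byte
         // loop and tmp1 == 8 bytes for the tail.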
3454     jccb(Assembler::zero, test_tail);
3455 
3456     lea(ary1, Address(ary1, len, Address::times_1));
3457     negptr(len);
3458 
3459     bind(test_64_loop);
3460     // Check whether our 64 byte-sized elements contain any negatives
3461     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3462     kortestql(mask1, mask1);
3463     jcc(Assembler::notZero, TRUE_LABEL);
3464 
3465     addptr(len, 64);
3466     jccb(Assembler::notZero, test_64_loop);
3467 
3468 
3469     bind(test_tail);
3470     // bail out when there is nothing to be done
3471     testl(tmp1, -1);
3472     jcc(Assembler::zero, FALSE_LABEL);
3473 
3474     // ~(~0 << len) applied up to two times (for 32-bit scenario)
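         // Illustrative example: tmp1 == 3 gives ~(~0 << 3) == 0b111, i.e. a mask
         // selecting only the 3 tail bytes.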
3475 #ifdef _LP64
3476     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3477     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3478     notq(tmp3_aliased);
3479     kmovql(mask2, tmp3_aliased);
3480 #else
3481     Label k_init;
3482     jmp(k_init);
3483 
3484     // We cannot read 64 bits from a general purpose register here, so the data
3485     // required to compose 64 1's is placed in the instruction stream.
3486     // We emit a 64-byte-wide series of elements 0..63 which is later used as a
3487     // compare target against the tail count held in the tmp1 register.
3488     // The result is a k register holding tmp1 consecutive 1's,
3489     // counting from the least significant bit.
3490     address tmp = pc();
3491     emit_int64(0x0706050403020100);
3492     emit_int64(0x0F0E0D0C0B0A0908);
3493     emit_int64(0x1716151413121110);
3494     emit_int64(0x1F1E1D1C1B1A1918);
3495     emit_int64(0x2726252423222120);
3496     emit_int64(0x2F2E2D2C2B2A2928);
3497     emit_int64(0x3736353433323130);
3498     emit_int64(0x3F3E3D3C3B3A3938);
3499 
3500     bind(k_init);
3501     lea(len, InternalAddress(tmp));
3502     // create mask to test for negative byte inside a vector
3503     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3504     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3505 
3506 #endif
3507     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3508     ktestq(mask1, mask2);
3509     jcc(Assembler::notZero, TRUE_LABEL);
3510 
3511     jmp(FALSE_LABEL);
3512   } else {
3513     movl(result, len); // copy
3514 
3515     if (UseAVX >= 2 && UseSSE >= 2) {
3516       // With AVX2, use 32-byte vector compare
3517       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3518 
3519       // Compare 32-byte vectors
3520       andl(result, 0x0000001f);  //   tail count (in bytes)
3521       andl(len, 0xffffffe0);   // vector count (in bytes)
3522       jccb(Assembler::zero, COMPARE_TAIL);
3523 
3524       lea(ary1, Address(ary1, len, Address::times_1));
3525       negptr(len);
3526 
3527       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3528       movdl(vec2, tmp1);
3529       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
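           // vptest below ANDs each 32-byte chunk with this 0x80 mask and sets ZF only if
           // no byte has its sign bit set, so notZero means a negative byte was found.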
3530 
3531       bind(COMPARE_WIDE_VECTORS);
3532       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3533       vptest(vec1, vec2);
3534       jccb(Assembler::notZero, TRUE_LABEL);
3535       addptr(len, 32);
3536       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3537 
3538       testl(result, result);
3539       jccb(Assembler::zero, FALSE_LABEL);
3540 
3541       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3542       vptest(vec1, vec2);
3543       jccb(Assembler::notZero, TRUE_LABEL);
3544       jmpb(FALSE_LABEL);
3545 
3546       bind(COMPARE_TAIL); // len is zero
3547       movl(len, result);
3548       // Fallthru to tail compare
3549     } else if (UseSSE42Intrinsics) {
3550       // With SSE4.2, use double quad vector compare
3551       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3552 
3553       // Compare 16-byte vectors
3554       andl(result, 0x0000000f);  //   tail count (in bytes)
3555       andl(len, 0xfffffff0);   // vector count (in bytes)
3556       jcc(Assembler::zero, COMPARE_TAIL);
3557 
3558       lea(ary1, Address(ary1, len, Address::times_1));
3559       negptr(len);
3560 
3561       movl(tmp1, 0x80808080);
3562       movdl(vec2, tmp1);
3563       pshufd(vec2, vec2, 0);
3564 
3565       bind(COMPARE_WIDE_VECTORS);
3566       movdqu(vec1, Address(ary1, len, Address::times_1));
3567       ptest(vec1, vec2);
3568       jcc(Assembler::notZero, TRUE_LABEL);
3569       addptr(len, 16);
3570       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3571 
3572       testl(result, result);
3573       jcc(Assembler::zero, FALSE_LABEL);
3574 
3575       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3576       ptest(vec1, vec2);
3577       jccb(Assembler::notZero, TRUE_LABEL);
3578       jmpb(FALSE_LABEL);
3579 
3580       bind(COMPARE_TAIL); // len is zero
3581       movl(len, result);
3582       // Fallthru to tail compare
3583     }
3584   }
3585   // Compare 4-byte vectors
3586   andl(len, 0xfffffffc); // vector count (in bytes)
3587   jccb(Assembler::zero, COMPARE_CHAR);
3588 
3589   lea(ary1, Address(ary1, len, Address::times_1));
3590   negptr(len);
3591 
3592   bind(COMPARE_VECTORS);
3593   movl(tmp1, Address(ary1, len, Address::times_1));
3594   andl(tmp1, 0x80808080);
3595   jccb(Assembler::notZero, TRUE_LABEL);
3596   addptr(len, 4);
3597   jcc(Assembler::notZero, COMPARE_VECTORS);
3598 
3599   // Compare trailing char (final 2 bytes), if any
3600   bind(COMPARE_CHAR);
3601   testl(result, 0x2);   // tail  char
3602   jccb(Assembler::zero, COMPARE_BYTE);
3603   load_unsigned_short(tmp1, Address(ary1, 0));
3604   andl(tmp1, 0x00008080);
3605   jccb(Assembler::notZero, TRUE_LABEL);
3606   subptr(result, 2);
3607   lea(ary1, Address(ary1, 2));
3608 
3609   bind(COMPARE_BYTE);
3610   testl(result, 0x1);   // tail  byte
3611   jccb(Assembler::zero, FALSE_LABEL);
3612   load_unsigned_byte(tmp1, Address(ary1, 0));
3613   andl(tmp1, 0x00000080);
3614   jccb(Assembler::notEqual, TRUE_LABEL);
3615   jmpb(FALSE_LABEL);
3616 
3617   bind(TRUE_LABEL);
3618   movl(result, 1);   // return true
3619   jmpb(DONE);
3620 
3621   bind(FALSE_LABEL);
3622   xorl(result, result); // return false
3623 
3624   // That's it
3625   bind(DONE);
3626   if (UseAVX >= 2 && UseSSE >= 2) {
3627     // clean upper bits of YMM registers
3628     vpxor(vec1, vec1);
3629     vpxor(vec2, vec2);
3630   }
3631 }
3632 // Compare char[] or byte[] arrays (or substrings) aligned to 4 bytes.
3633 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3634                                       Register limit, Register result, Register chr,
3635                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3636   ShortBranchVerifier sbv(this);
3637   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3638 
3639   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3640   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3641 
3642   if (is_array_equ) {
3643     // Check the input args
3644     cmpoop(ary1, ary2);
3645     jcc(Assembler::equal, TRUE_LABEL);
3646 
3647     // Need additional checks for arrays_equals.
3648     testptr(ary1, ary1);
3649     jcc(Assembler::zero, FALSE_LABEL);
3650     testptr(ary2, ary2);
3651     jcc(Assembler::zero, FALSE_LABEL);
3652 
3653     // Check the lengths
3654     movl(limit, Address(ary1, length_offset));
3655     cmpl(limit, Address(ary2, length_offset));
3656     jcc(Assembler::notEqual, FALSE_LABEL);
3657   }
3658 
3659   // count == 0
3660   testl(limit, limit);
3661   jcc(Assembler::zero, TRUE_LABEL);
3662 
3663   if (is_array_equ) {
3664     // Load array address
3665     lea(ary1, Address(ary1, base_offset));
3666     lea(ary2, Address(ary2, base_offset));
3667   }
3668 
3669   if (is_array_equ && is_char) {
3670     // arrays_equals when used for char[].
3671     shll(limit, 1);      // byte count != 0
3672   }
3673   movl(result, limit); // copy
3674 
3675   if (UseAVX >= 2) {
3676     // With AVX2, use 32-byte vector compare
3677     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3678 
3679     // Compare 32-byte vectors
3680     andl(result, 0x0000001f);  //   tail count (in bytes)
3681     andl(limit, 0xffffffe0);   // vector count (in bytes)
3682     jcc(Assembler::zero, COMPARE_TAIL);
3683 
3684     lea(ary1, Address(ary1, limit, Address::times_1));
3685     lea(ary2, Address(ary2, limit, Address::times_1));
3686     negptr(limit);
3687 
3688 #ifdef _LP64
3689     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3690       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3691 
3692       cmpl(limit, -64);
3693       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3694 
3695       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3696 
3697       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3698       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3699       kortestql(mask, mask);
3700       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3701       addptr(limit, 64);  // update since we already compared at this addr
3702       cmpl(limit, -64);
3703       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3704 
3705       // At this point we may still need to compare -limit+result bytes.
3706       // We could execute the next two instructions and just continue via the non-wide path:
3707       //  cmpl(limit, 0);
3708       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3709       // But since we stopped at the points ary{1,2}+limit which are
3710       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3711       // (|limit| <= 32 and result < 32),
3712       // we may just compare the last 64 bytes.
3713       //
3714       addptr(result, -64);   // it is safe, because we just came from this area
3715       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3716       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3717       kortestql(mask, mask);
3718       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3719 
3720       jmp(TRUE_LABEL);
3721 
3722       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3723 
3724     }//if (VM_Version::supports_avx512vlbw())
3725 #endif //_LP64
3726     bind(COMPARE_WIDE_VECTORS);
3727     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3728     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3729     vpxor(vec1, vec2);
3730 
3731     vptest(vec1, vec1);
3732     jcc(Assembler::notZero, FALSE_LABEL);
3733     addptr(limit, 32);
3734     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3735 
3736     testl(result, result);
3737     jcc(Assembler::zero, TRUE_LABEL);
3738 
3739     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3740     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3741     vpxor(vec1, vec2);
3742 
3743     vptest(vec1, vec1);
3744     jccb(Assembler::notZero, FALSE_LABEL);
3745     jmpb(TRUE_LABEL);
3746 
3747     bind(COMPARE_TAIL); // limit is zero
3748     movl(limit, result);
3749     // Fallthru to tail compare
3750   } else if (UseSSE42Intrinsics) {
3751     // With SSE4.2, use double quad vector compare
3752     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3753 
3754     // Compare 16-byte vectors
3755     andl(result, 0x0000000f);  //   tail count (in bytes)
3756     andl(limit, 0xfffffff0);   // vector count (in bytes)
3757     jcc(Assembler::zero, COMPARE_TAIL);
3758 
3759     lea(ary1, Address(ary1, limit, Address::times_1));
3760     lea(ary2, Address(ary2, limit, Address::times_1));
3761     negptr(limit);
3762 
3763     bind(COMPARE_WIDE_VECTORS);
3764     movdqu(vec1, Address(ary1, limit, Address::times_1));
3765     movdqu(vec2, Address(ary2, limit, Address::times_1));
3766     pxor(vec1, vec2);
3767 
3768     ptest(vec1, vec1);
3769     jcc(Assembler::notZero, FALSE_LABEL);
3770     addptr(limit, 16);
3771     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3772 
3773     testl(result, result);
3774     jcc(Assembler::zero, TRUE_LABEL);
3775 
3776     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3777     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3778     pxor(vec1, vec2);
3779 
3780     ptest(vec1, vec1);
3781     jccb(Assembler::notZero, FALSE_LABEL);
3782     jmpb(TRUE_LABEL);
3783 
3784     bind(COMPARE_TAIL); // limit is zero
3785     movl(limit, result);
3786     // Fallthru to tail compare
3787   }
3788 
3789   // Compare 4-byte vectors
3790   andl(limit, 0xfffffffc); // vector count (in bytes)
3791   jccb(Assembler::zero, COMPARE_CHAR);
3792 
3793   lea(ary1, Address(ary1, limit, Address::times_1));
3794   lea(ary2, Address(ary2, limit, Address::times_1));
3795   negptr(limit);
3796 
3797   bind(COMPARE_VECTORS);
3798   movl(chr, Address(ary1, limit, Address::times_1));
3799   cmpl(chr, Address(ary2, limit, Address::times_1));
3800   jccb(Assembler::notEqual, FALSE_LABEL);
3801   addptr(limit, 4);
3802   jcc(Assembler::notZero, COMPARE_VECTORS);
3803 
3804   // Compare trailing char (final 2 bytes), if any
3805   bind(COMPARE_CHAR);
3806   testl(result, 0x2);   // tail  char
3807   jccb(Assembler::zero, COMPARE_BYTE);
3808   load_unsigned_short(chr, Address(ary1, 0));
3809   load_unsigned_short(limit, Address(ary2, 0));
3810   cmpl(chr, limit);
3811   jccb(Assembler::notEqual, FALSE_LABEL);
3812 
3813   if (is_array_equ && is_char) {
3814     bind(COMPARE_BYTE);
3815   } else {
3816     lea(ary1, Address(ary1, 2));
3817     lea(ary2, Address(ary2, 2));
3818 
3819     bind(COMPARE_BYTE);
3820     testl(result, 0x1);   // tail  byte
3821     jccb(Assembler::zero, TRUE_LABEL);
3822     load_unsigned_byte(chr, Address(ary1, 0));
3823     load_unsigned_byte(limit, Address(ary2, 0));
3824     cmpl(chr, limit);
3825     jccb(Assembler::notEqual, FALSE_LABEL);
3826   }
3827   bind(TRUE_LABEL);
3828   movl(result, 1);   // return true
3829   jmpb(DONE);
3830 
3831   bind(FALSE_LABEL);
3832   xorl(result, result); // return false
3833 
3834   // That's it
3835   bind(DONE);
3836   if (UseAVX >= 2) {
3837     // clean upper bits of YMM registers
3838     vpxor(vec1, vec1);
3839     vpxor(vec2, vec2);
3840   }
3841 }
3842 
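     // Emit an AVX-512 masked (predicated) vector operation whose second operand is an
     // immediate shift or rotate count; untouched lanes are merged or zeroed per 'merge'.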
3843 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3844                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
3845   switch(ideal_opc) {
3846     case Op_LShiftVS:
3847       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
3848     case Op_LShiftVI:
3849       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
3850     case Op_LShiftVL:
3851       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
3852     case Op_RShiftVS:
3853       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
3854     case Op_RShiftVI:
3855       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
3856     case Op_RShiftVL:
3857       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
3858     case Op_URShiftVS:
3859       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
3860     case Op_URShiftVI:
3861       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
3862     case Op_URShiftVL:
3863       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
3864     case Op_RotateRightV:
3865       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3866     case Op_RotateLeftV:
3867       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3868     default:
3869       fatal("Unsupported masked operation"); break;
3870   }
3871 }
3872 
3873 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3874                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
3875                                     bool is_varshift) {
3876   switch (ideal_opc) {
3877     case Op_AddVB:
3878       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3879     case Op_AddVS:
3880       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3881     case Op_AddVI:
3882       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3883     case Op_AddVL:
3884       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3885     case Op_AddVF:
3886       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3887     case Op_AddVD:
3888       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3889     case Op_SubVB:
3890       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3891     case Op_SubVS:
3892       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
3893     case Op_SubVI:
3894       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
3895     case Op_SubVL:
3896       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
3897     case Op_SubVF:
3898       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
3899     case Op_SubVD:
3900       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
3901     case Op_MulVS:
3902       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
3903     case Op_MulVI:
3904       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
3905     case Op_MulVL:
3906       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
3907     case Op_MulVF:
3908       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
3909     case Op_MulVD:
3910       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
3911     case Op_DivVF:
3912       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
3913     case Op_DivVD:
3914       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
3915     case Op_SqrtVF:
3916       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
3917     case Op_SqrtVD:
3918       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
3919     case Op_AbsVB:
3920       evpabsb(dst, mask, src2, merge, vlen_enc); break;
3921     case Op_AbsVS:
3922       evpabsw(dst, mask, src2, merge, vlen_enc); break;
3923     case Op_AbsVI:
3924       evpabsd(dst, mask, src2, merge, vlen_enc); break;
3925     case Op_AbsVL:
3926       evpabsq(dst, mask, src2, merge, vlen_enc); break;
3927     case Op_FmaVF:
3928       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
3929     case Op_FmaVD:
3930       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
3931     case Op_VectorRearrange:
3932       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
3933     case Op_LShiftVS:
3934       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3935     case Op_LShiftVI:
3936       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3937     case Op_LShiftVL:
3938       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3939     case Op_RShiftVS:
3940       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3941     case Op_RShiftVI:
3942       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3943     case Op_RShiftVL:
3944       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3945     case Op_URShiftVS:
3946       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3947     case Op_URShiftVI:
3948       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3949     case Op_URShiftVL:
3950       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3951     case Op_RotateLeftV:
3952       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3953     case Op_RotateRightV:
3954       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3955     case Op_MaxV:
3956       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3957     case Op_MinV:
3958       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3959     case Op_XorV:
3960       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3961     case Op_OrV:
3962       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3963     case Op_AndV:
3964       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3965     default:
3966       fatal("Unsupported masked operation"); break;
3967   }
3968 }
3969 
3970 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3971                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
3972   switch (ideal_opc) {
3973     case Op_AddVB:
3974       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3975     case Op_AddVS:
3976       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3977     case Op_AddVI:
3978       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3979     case Op_AddVL:
3980       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3981     case Op_AddVF:
3982       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3983     case Op_AddVD:
3984       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3985     case Op_SubVB:
3986       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3987     case Op_SubVS:
3988       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
3989     case Op_SubVI:
3990       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
3991     case Op_SubVL:
3992       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
3993     case Op_SubVF:
3994       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
3995     case Op_SubVD:
3996       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
3997     case Op_MulVS:
3998       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
3999     case Op_MulVI:
4000       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4001     case Op_MulVL:
4002       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4003     case Op_MulVF:
4004       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4005     case Op_MulVD:
4006       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4007     case Op_DivVF:
4008       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4009     case Op_DivVD:
4010       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4011     case Op_FmaVF:
4012       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4013     case Op_FmaVD:
4014       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4015     case Op_MaxV:
4016       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4017     case Op_MinV:
4018       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4019     case Op_XorV:
4020       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4021     case Op_OrV:
4022       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4023     case Op_AndV:
4024       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4025     default:
4026       fatal("Unsupported masked operation"); break;
4027   }
4028 }
4029 
4030 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4031                                   KRegister src1, KRegister src2) {
4032   BasicType etype = T_ILLEGAL;
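       // Select the k-register operand width from the mask length: up to 8 bits use the
       // byte form, 16 the word form, 32 the dword form, and 64 the qword form.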
4033   switch(mask_len) {
4034     case 2:
4035     case 4:
4036     case 8:  etype = T_BYTE; break;
4037     case 16: etype = T_SHORT; break;
4038     case 32: etype = T_INT; break;
4039     case 64: etype = T_LONG; break;
4040     default: fatal("Unsupported type"); break;
4041   }
4042   assert(etype != T_ILLEGAL, "");
4043   switch(ideal_opc) {
4044     case Op_AndVMask:
4045       kand(etype, dst, src1, src2); break;
4046     case Op_OrVMask:
4047       kor(etype, dst, src1, src2); break;
4048     case Op_XorVMask:
4049       kxor(etype, dst, src1, src2); break;
4050     default:
4051       fatal("Unsupported masked operation"); break;
4052   }
4053 }
4054 
4055 #ifdef _LP64
4056 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask,
4057                                               Register tmp, int masklen, int masksize,
4058                                               int vec_enc) {
4059   if (VM_Version::supports_avx512bw()) {
4060     kmovql(tmp, mask);
4061   } else {
4062     assert(masklen <= 16, "");
4063     kmovwl(tmp, mask);
4064   }
4065   if (masksize < 16) {
4066     andq(tmp, (((jlong)1 << masklen) - 1));
4067   }
4068   switch(opc) {
4069     case Op_VectorMaskTrueCount:
4070       popcntq(dst, tmp);
4071       break;
4072     case Op_VectorMaskLastTrue:
4073       mov64(dst, -1);
4074       bsrq(tmp, tmp);
4075       cmov(Assembler::notZero, dst, tmp);
4076       break;
4077     case Op_VectorMaskFirstTrue:
4078       mov64(dst, masklen);
4079       bsfq(tmp, tmp);
4080       cmov(Assembler::notZero, dst, tmp);
4081       break;
4082     default: assert(false, "Unhandled mask operation");
4083   }
4084 }
4085 
4086 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4087                                               XMMRegister xtmp1, Register tmp, int masklen, int masksize,
4088                                               int vec_enc) {
4089   assert(VM_Version::supports_avx(), "");
4090   vpxor(xtmp, xtmp, xtmp, vec_enc);
4091   vpsubb(xtmp, xtmp, mask, vec_enc);
4092   vpmovmskb(tmp, xtmp, vec_enc);
4093   if (masksize < 16) {
4094     andq(tmp, (((jlong)1 << masklen) - 1));
4095   }
4096   switch(opc) {
4097     case Op_VectorMaskTrueCount:
4098       popcntq(dst, tmp);
4099       break;
4100     case Op_VectorMaskLastTrue:
4101       mov64(dst, -1);
4102       bsrq(tmp, tmp);
4103       cmov(Assembler::notZero, dst, tmp);
4104       break;
4105     case Op_VectorMaskFirstTrue:
4106       mov64(dst, masklen);
4107       bsfq(tmp, tmp);
4108       cmov(Assembler::notZero, dst, tmp);
4109       break;
4110     default: assert(false, "Unhandled mask operation");
4111   }
4112 }
4113 #endif