/*
 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}
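
// Illustrative usage sketch (assumed caller, not taken from this file): a node
// operating on 32-byte vectors asks for the matching AVX encoding and passes
// it to the VEX/EVEX-encoded instruction emitters, e.g.
//   int vlen_enc = vector_length_encoding(32);   // Assembler::AVX_256bit
//   vpaddd(dst, src1, src2, vlen_enc);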

void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}
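
// Usage note (illustrative): 'count' must be a power of two so that
// (count - 1) forms a contiguous low-bit mask of the TSC value. For example,
// with RTMTotalCountIncrRate == 64 the branch is not taken on roughly 1 out
// of every 64 calls, and only those calls fall through to bump the counter.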

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.

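// A minimal sketch (assumed caller, not part of this file) of the ZF contract
// described above, roughly as the cmpFastLock/cmpFastUnlock rules consume it:
//   fast_lock(obj, box, rax /*tmp*/, scr, cx1, noreg,
//             NULL, NULL, NULL, /*use_rtm*/ false, /*profile_rtm*/ false);
//   jcc(Assembler::notEqual, SLOW_PATH);   // ZF == 0 -> call the runtime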

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
  jcc(Assembler::equal, DONE_LABEL);           // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0));      // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instruction support for C2 code generation in .ad files

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

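// The blend/compare sequences below implement Java Math.min/max semantics
// rather than raw vminps/vmaxps: min(-0.0, +0.0) must return -0.0 and a NaN in
// either input must produce NaN, which the bare x86 min/max instructions
// (they simply return the second operand in those cases) do not guarantee.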
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}
1234 
1235 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1236   switch (opcode) {
1237     case Op_RShiftVB:  // fall-through
1238     case Op_RShiftVS:  // fall-through
1239     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1240 
1241     case Op_LShiftVB:  // fall-through
1242     case Op_LShiftVS:  // fall-through
1243     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1244 
1245     case Op_URShiftVB: // fall-through
1246     case Op_URShiftVS: // fall-through
1247     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1248 
1249     default: assert(false, "%s", NodeClassNames[opcode]);
1250   }
1251 }
1252 
1253 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1254   switch (opcode) {
1255     case Op_RShiftVB:  // fall-through
1256     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1257 
1258     case Op_LShiftVB:  // fall-through
1259     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1260 
1261     case Op_URShiftVB: // fall-through
1262     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1263 
1264     default: assert(false, "%s", NodeClassNames[opcode]);
1265   }
1266 }
1267 
1268 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1269   assert(UseAVX >= 2, "required");
1270   switch (opcode) {
1271     case Op_RShiftVL: {
1272       if (UseAVX > 2) {
1273         assert(tmp == xnoreg, "not used");
1274         if (!VM_Version::supports_avx512vl()) {
1275           vlen_enc = Assembler::AVX_512bit;
1276         }
1277         evpsravq(dst, src, shift, vlen_enc);
1278       } else {
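             // There is no variable 64-bit arithmetic right shift below AVX-512, so derive
             // it from the logical shift: with m = sign_mask >>> shift,
             // ((x >>> shift) ^ m) - m sign-extends each lane (assuming
             // vector_long_sign_mask holds 0x8000000000000000 per lane).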
1279         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1280         vpsrlvq(dst, src, shift, vlen_enc);
1281         vpsrlvq(tmp, tmp, shift, vlen_enc);
1282         vpxor(dst, dst, tmp, vlen_enc);
1283         vpsubq(dst, dst, tmp, vlen_enc);
1284       }
1285       break;
1286     }
1287     case Op_LShiftVL: {
1288       assert(tmp == xnoreg, "not used");
1289       vpsllvq(dst, src, shift, vlen_enc);
1290       break;
1291     }
1292     case Op_URShiftVL: {
1293       assert(tmp == xnoreg, "not used");
1294       vpsrlvq(dst, src, shift, vlen_enc);
1295       break;
1296     }
1297     default: assert(false, "%s", NodeClassNames[opcode]);
1298   }
1299 }
1300 
1301 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
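     // The byte elements are sign- or zero-extended to dwords, shifted with the variable
     // dword shift, masked back to byte range and packed down to words.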
1302 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1303   assert(opcode == Op_LShiftVB ||
1304          opcode == Op_RShiftVB ||
1305          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1306   bool sign = (opcode != Op_URShiftVB);
1307   assert(vector_len == 0, "required");
1308   vextendbd(sign, dst, src, 1);
1309   vpmovzxbd(vtmp, shift, 1);
1310   varshiftd(opcode, dst, dst, vtmp, 1);
1311   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1312   vextracti128_high(vtmp, dst);
1313   vpackusdw(dst, dst, vtmp, 0);
1314 }
1315 
1316 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
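     // The bytes are widened to words, shifted with the variable word shift, masked back
     // to byte range and re-packed. vpackuswb packs within 128-bit lanes, so the 256-bit
     // result case needs the trailing vpermq(0xD8) to restore element order.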
1317 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1318   assert(opcode == Op_LShiftVB ||
1319          opcode == Op_RShiftVB ||
1320          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1321   bool sign = (opcode != Op_URShiftVB);
1322   int ext_vector_len = vector_len + 1;
1323   vextendbw(sign, dst, src, ext_vector_len);
1324   vpmovzxbw(vtmp, shift, ext_vector_len);
1325   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1326   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1327   if (vector_len == 0) {
1328     vextracti128_high(vtmp, dst);
1329     vpackuswb(dst, dst, vtmp, vector_len);
1330   } else {
1331     vextracti64x4_high(vtmp, dst);
1332     vpackuswb(dst, dst, vtmp, vector_len);
1333     vpermq(dst, dst, 0xD8, vector_len);
1334   }
1335 }
1336 
1337 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1338   switch(typ) {
1339     case T_BYTE:
1340       pinsrb(dst, val, idx);
1341       break;
1342     case T_SHORT:
1343       pinsrw(dst, val, idx);
1344       break;
1345     case T_INT:
1346       pinsrd(dst, val, idx);
1347       break;
1348     case T_LONG:
1349       pinsrq(dst, val, idx);
1350       break;
1351     default:
1352       assert(false,"Should not reach here.");
1353       break;
1354   }
1355 }
1356 
1357 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1358   switch(typ) {
1359     case T_BYTE:
1360       vpinsrb(dst, src, val, idx);
1361       break;
1362     case T_SHORT:
1363       vpinsrw(dst, src, val, idx);
1364       break;
1365     case T_INT:
1366       vpinsrd(dst, src, val, idx);
1367       break;
1368     case T_LONG:
1369       vpinsrq(dst, src, val, idx);
1370       break;
1371     default:
1372       assert(false,"Should not reach here.");
1373       break;
1374   }
1375 }
1376 
1377 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1378   switch(typ) {
1379     case T_INT:
1380       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1381       break;
1382     case T_FLOAT:
1383       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1384       break;
1385     case T_LONG:
1386       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1387       break;
1388     case T_DOUBLE:
1389       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1390       break;
1391     default:
1392       assert(false,"Should not reach here.");
1393       break;
1394   }
1395 }
1396 
1397 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1398   switch(typ) {
1399     case T_INT:
1400       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1401       break;
1402     case T_FLOAT:
1403       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1404       break;
1405     case T_LONG:
1406       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1407       break;
1408     case T_DOUBLE:
1409       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1410       break;
1411     default:
1412       assert(false,"Should not reach here.");
1413       break;
1414   }
1415 }
1416 
1417 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1418   switch(typ) {
1419     case T_INT:
1420       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1421       break;
1422     case T_FLOAT:
1423       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1424       break;
1425     case T_LONG:
1426       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1427       break;
1428     case T_DOUBLE:
1429       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1430       break;
1431     default:
1432       assert(false,"Should not reach here.");
1433       break;
1434   }
1435 }
1436 
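     // Turns a vector of byte booleans (0 or 1 per element) into an element-wide mask:
     // 0 - src yields 0x00 or 0xFF per byte, which is then sign-extended to the element
     // size.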
1437 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1438   if (vlen_in_bytes <= 16) {
1439     pxor (dst, dst);
1440     psubb(dst, src);
1441     switch (elem_bt) {
1442       case T_BYTE:   /* nothing to do */ break;
1443       case T_SHORT:  pmovsxbw(dst, dst); break;
1444       case T_INT:    pmovsxbd(dst, dst); break;
1445       case T_FLOAT:  pmovsxbd(dst, dst); break;
1446       case T_LONG:   pmovsxbq(dst, dst); break;
1447       case T_DOUBLE: pmovsxbq(dst, dst); break;
1448 
1449       default: assert(false, "%s", type2name(elem_bt));
1450     }
1451   } else {
1452     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1453     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1454 
1455     vpxor (dst, dst, dst, vlen_enc);
1456     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1457 
1458     switch (elem_bt) {
1459       case T_BYTE:   /* nothing to do */            break;
1460       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1461       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1462       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1463       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1464       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1465 
1466       default: assert(false, "%s", type2name(elem_bt));
1467     }
1468   }
1469 }
1470 
1471 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp,
1472                                          Register tmp, bool novlbwdq, int vlen_enc) {
1473   if (novlbwdq) {
1474     vpmovsxbd(xtmp, src, vlen_enc);
1475     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1476             Assembler::eq, true, vlen_enc, tmp);
1477   } else {
1478     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1479     vpsubb(xtmp, xtmp, src, vlen_enc);
1480     evpmovb2m(dst, xtmp, vlen_enc);
1481   }
1482 }
1483 
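     // Loads the first vlen_in_bytes bytes of the iota table (assumed to hold the
     // consecutive byte indices 0, 1, 2, ...) for use as an index vector.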
1484 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1485   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1486   if (vlen_in_bytes == 4) {
1487     movdl(dst, addr);
1488   } else if (vlen_in_bytes == 8) {
1489     movq(dst, addr);
1490   } else if (vlen_in_bytes == 16) {
1491     movdqu(dst, addr, scratch);
1492   } else if (vlen_in_bytes == 32) {
1493     vmovdqu(dst, addr, scratch);
1494   } else {
1495     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1496     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1497   }
1498 }
1499 
1500 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
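     // Each reduceN routine folds the upper half of the vector into the lower half (via an
     // extract or shuffle) and delegates to the half-width routine; for the integral
     // reductions the scalar input src1 is combined at the narrowest step.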
1501 
1502 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1503   int vector_len = Assembler::AVX_128bit;
1504 
1505   switch (opcode) {
1506     case Op_AndReductionV:  pand(dst, src); break;
1507     case Op_OrReductionV:   por (dst, src); break;
1508     case Op_XorReductionV:  pxor(dst, src); break;
1509     case Op_MinReductionV:
1510       switch (typ) {
1511         case T_BYTE:        pminsb(dst, src); break;
1512         case T_SHORT:       pminsw(dst, src); break;
1513         case T_INT:         pminsd(dst, src); break;
1514         case T_LONG:        assert(UseAVX > 2, "required");
1515                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1516         default:            assert(false, "wrong type");
1517       }
1518       break;
1519     case Op_MaxReductionV:
1520       switch (typ) {
1521         case T_BYTE:        pmaxsb(dst, src); break;
1522         case T_SHORT:       pmaxsw(dst, src); break;
1523         case T_INT:         pmaxsd(dst, src); break;
1524         case T_LONG:        assert(UseAVX > 2, "required");
1525                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1526         default:            assert(false, "wrong type");
1527       }
1528       break;
1529     case Op_AddReductionVF: addss(dst, src); break;
1530     case Op_AddReductionVD: addsd(dst, src); break;
1531     case Op_AddReductionVI:
1532       switch (typ) {
1533         case T_BYTE:        paddb(dst, src); break;
1534         case T_SHORT:       paddw(dst, src); break;
1535         case T_INT:         paddd(dst, src); break;
1536         default:            assert(false, "wrong type");
1537       }
1538       break;
1539     case Op_AddReductionVL: paddq(dst, src); break;
1540     case Op_MulReductionVF: mulss(dst, src); break;
1541     case Op_MulReductionVD: mulsd(dst, src); break;
1542     case Op_MulReductionVI:
1543       switch (typ) {
1544         case T_SHORT:       pmullw(dst, src); break;
1545         case T_INT:         pmulld(dst, src); break;
1546         default:            assert(false, "wrong type");
1547       }
1548       break;
1549     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1550                             vpmullq(dst, dst, src, vector_len); break;
1551     default:                assert(false, "wrong opcode");
1552   }
1553 }
1554 
1555 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1556   int vector_len = Assembler::AVX_256bit;
1557 
1558   switch (opcode) {
1559     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1560     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1561     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1562     case Op_MinReductionV:
1563       switch (typ) {
1564         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1565         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1566         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1567         case T_LONG:        assert(UseAVX > 2, "required");
1568                             vpminsq(dst, src1, src2, vector_len); break;
1569         default:            assert(false, "wrong type");
1570       }
1571       break;
1572     case Op_MaxReductionV:
1573       switch (typ) {
1574         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1575         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1576         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1577         case T_LONG:        assert(UseAVX > 2, "required");
1578                             vpmaxsq(dst, src1, src2, vector_len); break;
1579         default:            assert(false, "wrong type");
1580       }
1581       break;
1582     case Op_AddReductionVI:
1583       switch (typ) {
1584         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1585         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1586         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1587         default:            assert(false, "wrong type");
1588       }
1589       break;
1590     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1591     case Op_MulReductionVI:
1592       switch (typ) {
1593         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1594         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1595         default:            assert(false, "wrong type");
1596       }
1597       break;
1598     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1599     default:                assert(false, "wrong opcode");
1600   }
1601 }
1602 
1603 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1604                                   XMMRegister dst, XMMRegister src,
1605                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1606   switch (opcode) {
1607     case Op_AddReductionVF:
1608     case Op_MulReductionVF:
1609       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1610       break;
1611 
1612     case Op_AddReductionVD:
1613     case Op_MulReductionVD:
1614       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1615       break;
1616 
1617     default: assert(false, "wrong opcode");
1618   }
1619 }
1620 
1621 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1622                              Register dst, Register src1, XMMRegister src2,
1623                              XMMRegister vtmp1, XMMRegister vtmp2) {
1624   switch (vlen) {
1625     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1626     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1627     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1628     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1629 
1630     default: assert(false, "wrong vector length");
1631   }
1632 }
1633 
1634 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1635                              Register dst, Register src1, XMMRegister src2,
1636                              XMMRegister vtmp1, XMMRegister vtmp2) {
1637   switch (vlen) {
1638     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1639     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1640     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1641     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1642 
1643     default: assert(false, "wrong vector length");
1644   }
1645 }
1646 
1647 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1648                              Register dst, Register src1, XMMRegister src2,
1649                              XMMRegister vtmp1, XMMRegister vtmp2) {
1650   switch (vlen) {
1651     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1652     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1653     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1654     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1655 
1656     default: assert(false, "wrong vector length");
1657   }
1658 }
1659 
1660 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1661                              Register dst, Register src1, XMMRegister src2,
1662                              XMMRegister vtmp1, XMMRegister vtmp2) {
1663   switch (vlen) {
1664     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1665     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1666     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1667     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1668 
1669     default: assert(false, "wrong vector length");
1670   }
1671 }
1672 
1673 #ifdef _LP64
1674 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1675                              Register dst, Register src1, XMMRegister src2,
1676                              XMMRegister vtmp1, XMMRegister vtmp2) {
1677   switch (vlen) {
1678     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1679     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1680     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1681 
1682     default: assert(false, "wrong vector length");
1683   }
1684 }
1685 #endif // _LP64
1686 
1687 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1688   switch (vlen) {
1689     case 2:
1690       assert(vtmp2 == xnoreg, "");
1691       reduce2F(opcode, dst, src, vtmp1);
1692       break;
1693     case 4:
1694       assert(vtmp2 == xnoreg, "");
1695       reduce4F(opcode, dst, src, vtmp1);
1696       break;
1697     case 8:
1698       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1699       break;
1700     case 16:
1701       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1702       break;
1703     default: assert(false, "wrong vector length");
1704   }
1705 }
1706 
1707 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1708   switch (vlen) {
1709     case 2:
1710       assert(vtmp2 == xnoreg, "");
1711       reduce2D(opcode, dst, src, vtmp1);
1712       break;
1713     case 4:
1714       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1715       break;
1716     case 8:
1717       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1718       break;
1719     default: assert(false, "wrong vector length");
1720   }
1721 }
1722 
1723 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1724   if (opcode == Op_AddReductionVI) {
1725     if (vtmp1 != src2) {
1726       movdqu(vtmp1, src2);
1727     }
1728     phaddd(vtmp1, vtmp1);
1729   } else {
1730     pshufd(vtmp1, src2, 0x1);
1731     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1732   }
1733   movdl(vtmp2, src1);
1734   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1735   movdl(dst, vtmp1);
1736 }
1737 
1738 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1739   if (opcode == Op_AddReductionVI) {
1740     if (vtmp1 != src2) {
1741       movdqu(vtmp1, src2);
1742     }
1743     phaddd(vtmp1, src2);
1744     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1745   } else {
1746     pshufd(vtmp2, src2, 0xE);
1747     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1748     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1749   }
1750 }
1751 
1752 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1753   if (opcode == Op_AddReductionVI) {
1754     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1755     vextracti128_high(vtmp2, vtmp1);
1756     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1757     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1758   } else {
1759     vextracti128_high(vtmp1, src2);
1760     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1761     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1762   }
1763 }
1764 
1765 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1766   vextracti64x4_high(vtmp2, src2);
1767   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1768   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1769 }
1770 
1771 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1772   pshufd(vtmp2, src2, 0x1);
1773   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1774   movdqu(vtmp1, vtmp2);
1775   psrldq(vtmp1, 2);
1776   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1777   movdqu(vtmp2, vtmp1);
1778   psrldq(vtmp2, 1);
1779   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1780   movdl(vtmp2, src1);
1781   pmovsxbd(vtmp1, vtmp1);
1782   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1783   pextrb(dst, vtmp1, 0x0);
1784   movsbl(dst, dst);
1785 }
1786 
1787 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1788   pshufd(vtmp1, src2, 0xE);
1789   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1790   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1791 }
1792 
1793 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1794   vextracti128_high(vtmp2, src2);
1795   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1796   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1797 }
1798 
1799 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1800   vextracti64x4_high(vtmp1, src2);
1801   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1802   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1803 }
1804 
1805 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1806   pmovsxbw(vtmp2, src2);
1807   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1808 }
1809 
1810 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1811   if (UseAVX > 1) {
1812     int vector_len = Assembler::AVX_256bit;
1813     vpmovsxbw(vtmp1, src2, vector_len);
1814     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1815   } else {
1816     pmovsxbw(vtmp2, src2);
1817     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1818     pshufd(vtmp2, src2, 0xE);  // bring the upper 8 bytes down to the low quadword
1819     pmovsxbw(vtmp2, vtmp2);
1820     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1821   }
1822 }
1823 
1824 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1825   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1826     int vector_len = Assembler::AVX_512bit;
1827     vpmovsxbw(vtmp1, src2, vector_len);
1828     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1829   } else {
1830     assert(UseAVX >= 2,"Should not reach here.");
1831     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1832     vextracti128_high(vtmp2, src2);
1833     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1834   }
1835 }
1836 
1837 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1838   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1839   vextracti64x4_high(vtmp2, src2);
1840   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1841 }
1842 
1843 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1844   if (opcode == Op_AddReductionVI) {
1845     if (vtmp1 != src2) {
1846       movdqu(vtmp1, src2);
1847     }
1848     phaddw(vtmp1, vtmp1);
1849     phaddw(vtmp1, vtmp1);
1850   } else {
1851     pshufd(vtmp2, src2, 0x1);
1852     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1853     movdqu(vtmp1, vtmp2);
1854     psrldq(vtmp1, 2);
1855     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1856   }
1857   movdl(vtmp2, src1);
1858   pmovsxwd(vtmp1, vtmp1);
1859   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1860   pextrw(dst, vtmp1, 0x0);
1861   movswl(dst, dst);
1862 }
1863 
1864 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1865   if (opcode == Op_AddReductionVI) {
1866     if (vtmp1 != src2) {
1867       movdqu(vtmp1, src2);
1868     }
1869     phaddw(vtmp1, src2);
1870   } else {
1871     pshufd(vtmp1, src2, 0xE);
1872     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1873   }
1874   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1875 }
1876 
1877 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1878   if (opcode == Op_AddReductionVI) {
1879     int vector_len = Assembler::AVX_256bit;
1880     vphaddw(vtmp2, src2, src2, vector_len);
1881     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1882   } else {
1883     vextracti128_high(vtmp2, src2);
1884     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1885   }
1886   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1887 }
1888 
1889 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1890   int vector_len = Assembler::AVX_256bit;
1891   vextracti64x4_high(vtmp1, src2);
1892   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1893   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1894 }
1895 
1896 #ifdef _LP64
1897 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1898   pshufd(vtmp2, src2, 0xE);
1899   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1900   movdq(vtmp1, src1);
1901   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1902   movdq(dst, vtmp1);
1903 }
1904 
1905 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1906   vextracti128_high(vtmp1, src2);
1907   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1908   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1909 }
1910 
1911 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1912   vextracti64x4_high(vtmp2, src2);
1913   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1914   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1915 }
1916 
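     // Builds a mask register with the low 'len' bits set: BZHI clears bit 'len' and above
     // of an all-ones value before it is moved into the k-register.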
1917 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1918   assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1919   mov64(temp, -1L);
1920   bzhiq(temp, temp, len);
1921   kmovql(dst, temp);
1922 }
1923 #endif // _LP64
1924 
1925 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1926   reduce_operation_128(T_FLOAT, opcode, dst, src);
1927   pshufd(vtmp, src, 0x1);
1928   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1929 }
1930 
1931 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1932   reduce2F(opcode, dst, src, vtmp);
1933   pshufd(vtmp, src, 0x2);
1934   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1935   pshufd(vtmp, src, 0x3);
1936   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1937 }
1938 
1939 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1940   reduce4F(opcode, dst, src, vtmp2);
1941   vextractf128_high(vtmp2, src);
1942   reduce4F(opcode, dst, vtmp2, vtmp1);
1943 }
1944 
1945 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1946   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1947   vextracti64x4_high(vtmp1, src);
1948   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1949 }
1950 
1951 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1952   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1953   pshufd(vtmp, src, 0xE);
1954   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1955 }
1956 
1957 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1958   reduce2D(opcode, dst, src, vtmp2);
1959   vextractf128_high(vtmp2, src);
1960   reduce2D(opcode, dst, vtmp2, vtmp1);
1961 }
1962 
1963 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1964   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1965   vextracti64x4_high(vtmp1, src);
1966   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1967 }
1968 
1969 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1970   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1971 }
1972 
1973 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1974   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1975 }
1976 
1977 
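     // Float min/max reduction: each iteration halves the active width, either by
     // extracting the upper 256/128 bits or by permuting within a lane, and folds the
     // halves together with vminmax_fp.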
1978 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1979                                           XMMRegister dst, XMMRegister src,
1980                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1981                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1982   int permconst[] = {1, 14};
1983   XMMRegister wsrc = src;
1984   XMMRegister wdst = xmm_0;
1985   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1986 
1987   int vlen_enc = Assembler::AVX_128bit;
1988   if (vlen == 16) {
1989     vlen_enc = Assembler::AVX_256bit;
1990   }
1991 
1992   for (int i = log2(vlen) - 1; i >=0; i--) {
1993     if (i == 0 && !is_dst_valid) {
1994       wdst = dst;
1995     }
1996     if (i == 3) {
1997       vextracti64x4_high(wtmp, wsrc);
1998     } else if (i == 2) {
1999       vextracti128_high(wtmp, wsrc);
2000     } else { // i = [0,1]
2001       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2002     }
2003     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2004     wsrc = wdst;
2005     vlen_enc = Assembler::AVX_128bit;
2006   }
2007   if (is_dst_valid) {
2008     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2009   }
2010 }
2011 
2012 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2013                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2014                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2015   XMMRegister wsrc = src;
2016   XMMRegister wdst = xmm_0;
2017   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2018   int vlen_enc = Assembler::AVX_128bit;
2019   if (vlen == 8) {
2020     vlen_enc = Assembler::AVX_256bit;
2021   }
2022   for (int i = log2(vlen) - 1; i >=0; i--) {
2023     if (i == 0 && !is_dst_valid) {
2024       wdst = dst;
2025     }
2026     if (i == 1) {
2027       vextracti128_high(wtmp, wsrc);
2028     } else if (i == 2) {
2029       vextracti64x4_high(wtmp, wsrc);
2030     } else {
2031       assert(i == 0, "%d", i);
2032       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2033     }
2034     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2035     wsrc = wdst;
2036     vlen_enc = Assembler::AVX_128bit;
2037   }
2038   if (is_dst_valid) {
2039     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2040   }
2041 }
2042 
2043 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2044   switch (bt) {
2045     case T_BYTE:  pextrb(dst, src, idx); break;
2046     case T_SHORT: pextrw(dst, src, idx); break;
2047     case T_INT:   pextrd(dst, src, idx); break;
2048     case T_LONG:  pextrq(dst, src, idx); break;
2049 
2050     default:
2051       assert(false,"Should not reach here.");
2052       break;
2053   }
2054 }
2055 
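     // Returns the 128-bit lane that holds 'elemindex', extracting it into dst when it is
     // not the lowest lane; lane 1 needs AVX (vextractf128), lanes 2 and 3 need AVX-512
     // (vextractf32x4).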
2056 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2057   int esize =  type2aelembytes(typ);
2058   int elem_per_lane = 16/esize;
2059   int lane = elemindex / elem_per_lane;
2060   int eindex = elemindex % elem_per_lane;
2061 
2062   if (lane >= 2) {
2063     assert(UseAVX > 2, "required");
2064     vextractf32x4(dst, src, lane & 3);
2065     return dst;
2066   } else if (lane > 0) {
2067     assert(UseAVX > 0, "required");
2068     vextractf128(dst, src, lane);
2069     return dst;
2070   } else {
2071     return src;
2072   }
2073 }
2074 
2075 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2076   int esize =  type2aelembytes(typ);
2077   int elem_per_lane = 16/esize;
2078   int eindex = elemindex % elem_per_lane;
2079   assert(is_integral_type(typ),"required");
2080 
2081   if (eindex == 0) {
2082     if (typ == T_LONG) {
2083       movq(dst, src);
2084     } else {
2085       movdl(dst, src);
2086       if (typ == T_BYTE)
2087         movsbl(dst, dst);
2088       else if (typ == T_SHORT)
2089         movswl(dst, dst);
2090     }
2091   } else {
2092     extract(typ, dst, src, eindex);
2093   }
2094 }
2095 
2096 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2097   int esize =  type2aelembytes(typ);
2098   int elem_per_lane = 16/esize;
2099   int eindex = elemindex % elem_per_lane;
2100   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2101 
2102   if (eindex == 0) {
2103     movq(dst, src);
2104   } else {
2105     if (typ == T_FLOAT) {
2106       if (UseAVX == 0) {
2107         movdqu(dst, src);
2108         pshufps(dst, dst, eindex);
2109       } else {
2110         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2111       }
2112     } else {
2113       if (UseAVX == 0) {
2114         movdqu(dst, src);
2115         psrldq(dst, eindex*esize);
2116       } else {
2117         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2118       }
2119       movq(dst, dst);
2120     }
2121   }
2122   // Zero upper bits
2123   if (typ == T_FLOAT) {
2124     if (UseAVX == 0) {
2125       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2126       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2127       pand(dst, vtmp);
2128     } else {
2129       assert((tmp != noreg), "required.");
2130       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2131     }
2132   }
2133 }
2134 
2135 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2136   switch(typ) {
2137     case T_BYTE:
2138     case T_BOOLEAN:
2139       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2140       break;
2141     case T_SHORT:
2142     case T_CHAR:
2143       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2144       break;
2145     case T_INT:
2146     case T_FLOAT:
2147       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2148       break;
2149     case T_LONG:
2150     case T_DOUBLE:
2151       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2152       break;
2153     default:
2154       assert(false,"Should not reach here.");
2155       break;
2156   }
2157 }
2158 
2159 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2160   switch(typ) {
2161     case T_BOOLEAN:
2162     case T_BYTE:
2163       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2164       break;
2165     case T_CHAR:
2166     case T_SHORT:
2167       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2168       break;
2169     case T_INT:
2170     case T_FLOAT:
2171       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2172       break;
2173     case T_LONG:
2174     case T_DOUBLE:
2175       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2176       break;
2177     default:
2178       assert(false,"Should not reach here.");
2179       break;
2180   }
2181 }
2182 
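     // Unsigned element compare emulated by zero-extending both operands to the next wider
     // element size, comparing at that width, and packing the all-ones/all-zero results
     // back down to the original element width.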
2183 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2184                             int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2185   int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2186   switch (typ) {
2187   case T_BYTE:
2188     vpmovzxbw(vtmp1, src1, vlen_enc);
2189     vpmovzxbw(vtmp2, src2, vlen_enc);
2190     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2191     vpacksswb(dst, dst, dst, vlen_enc);
2192     break;
2193   case T_SHORT:
2194     vpmovzxwd(vtmp1, src1, vlen_enc);
2195     vpmovzxwd(vtmp2, src2, vlen_enc);
2196     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2197     vpackssdw(dst, dst, dst, vlen_enc);
2198     break;
2199   case T_INT:
2200     vpmovzxdq(vtmp1, src1, vlen_enc);
2201     vpmovzxdq(vtmp2, src2, vlen_enc);
2202     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2203     vpermilps(dst, dst, 8, vlen_enc);
2204     break;
2205   default:
2206     assert(false, "Should not reach here");
2207   }
2208   if (vlen_in_bytes == 16) {
2209     vpermpd(dst, dst, 0x8, vlen_enc);
2210   }
2211 }
2212 
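     // 32-byte variant of vpcmpu: each 128-bit half of the inputs is zero-extended and
     // compared at the wider width, and the two packed results are merged back into a
     // single 256-bit vector.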
2213 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2214                               XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2215   int vlen_enc = vector_length_encoding(vlen_in_bytes);
2216   switch (typ) {
2217   case T_BYTE:
2218     vpmovzxbw(vtmp1, src1, vlen_enc);
2219     vpmovzxbw(vtmp2, src2, vlen_enc);
2220     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2221     vextracti128(vtmp1, src1, 1);
2222     vextracti128(vtmp2, src2, 1);
2223     vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2224     vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2225     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2226     vpacksswb(dst, dst, vtmp3, vlen_enc);
2227     vpermpd(dst, dst, 0xd8, vlen_enc);
2228     break;
2229   case T_SHORT:
2230     vpmovzxwd(vtmp1, src1, vlen_enc);
2231     vpmovzxwd(vtmp2, src2, vlen_enc);
2232     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2233     vextracti128(vtmp1, src1, 1);
2234     vextracti128(vtmp2, src2, 1);
2235     vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2236     vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2237     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D,  vlen_enc, scratch);
2238     vpackssdw(dst, dst, vtmp3, vlen_enc);
2239     vpermpd(dst, dst, 0xd8, vlen_enc);
2240     break;
2241   case T_INT:
2242     vpmovzxdq(vtmp1, src1, vlen_enc);
2243     vpmovzxdq(vtmp2, src2, vlen_enc);
2244     vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2245     vpshufd(dst, dst, 8, vlen_enc);
2246     vpermq(dst, dst, 8, vlen_enc);
2247     vextracti128(vtmp1, src1, 1);
2248     vextracti128(vtmp2, src2, 1);
2249     vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2250     vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2251     vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q,  vlen_enc, scratch);
2252     vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2253     vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2254     vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2255     break;
2256   default:
2257     assert(false, "Should not reach here");
2258   }
2259 }
2260 
2261 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2262   switch(typ) {
2263     case T_BYTE:
2264       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2265       break;
2266     case T_SHORT:
2267       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2268       break;
2269     case T_INT:
2270     case T_FLOAT:
2271       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2272       break;
2273     case T_LONG:
2274     case T_DOUBLE:
2275       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2276       break;
2277     default:
2278       assert(false,"Should not reach here.");
2279       break;
2280   }
2281 }
2282 
2283 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2284                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2285   switch(vlen) {
2286     case 4:
2287       assert(vtmp1 != xnoreg, "required.");
2288       // Broadcast lower 32 bits to 128 bits before ptest
2289       pshufd(vtmp1, src1, 0x0);
2290       if (bt == BoolTest::overflow) {
2291         assert(vtmp2 != xnoreg, "required.");
2292         pshufd(vtmp2, src2, 0x0);
2293       } else {
2294         assert(vtmp2 == xnoreg, "required.");
2295         vtmp2 = src2;
2296       }
2297       ptest(vtmp1, vtmp2);
2298       break;
2299     case 8:
2300       assert(vtmp1 != xnoreg, "required.");
2301       // Broadcast lower 64 bits to 128 bits before ptest
2302       pshufd(vtmp1, src1, 0x4);
2303       if (bt == BoolTest::overflow) {
2304         assert(vtmp2 != xnoreg, "required.");
2305         pshufd(vtmp2, src2, 0x4);
2306       } else {
2307         assert(vtmp2 == xnoreg, "required.");
2308         vtmp2 = src2;
2309       }
2310       ptest(vtmp1, vtmp2);
2311       break;
2312     case 16:
2313       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2314       ptest(src1, src2);
2315       break;
2316     case 32:
2317       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2318       vptest(src1, src2, Assembler::AVX_256bit);
2319       break;
2320     case 64:
2321       {
2322         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2323         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2324         if (bt == BoolTest::ne) {
2325           ktestql(mask, mask);
2326         } else {
2327           assert(bt == BoolTest::overflow, "required");
2328           kortestql(mask, mask);
2329         }
2330       }
2331       break;
2332     default:
2333       assert(false,"Should not reach here.");
2334       break;
2335   }
2336 }
2337 
2338 //-------------------------------------------------------------------------------------------
2339 
2340 // IndexOf for constant substrings with size >= 8 chars
2341 // which don't need to be loaded through the stack.
2342 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2343                                          Register cnt1, Register cnt2,
2344                                          int int_cnt2,  Register result,
2345                                          XMMRegister vec, Register tmp,
2346                                          int ae) {
2347   ShortBranchVerifier sbv(this);
2348   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2349   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2350 
2351   // This method uses the pcmpestri instruction with bound registers
2352   //   inputs:
2353   //     xmm - substring
2354   //     rax - substring length (elements count)
2355   //     mem - scanned string
2356   //     rdx - string length (elements count)
2357   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2358   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2359   //   outputs:
2360   //     rcx - matched index in string
2361   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2362   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2363   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2364   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2365   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2366 
2367   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2368         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2369         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2370 
2371   // Note, inline_string_indexOf() generates checks:
2372   // if (substr.count > string.count) return -1;
2373   // if (substr.count == 0) return 0;
2374   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2375 
2376   // Load substring.
2377   if (ae == StrIntrinsicNode::UL) {
2378     pmovzxbw(vec, Address(str2, 0));
2379   } else {
2380     movdqu(vec, Address(str2, 0));
2381   }
2382   movl(cnt2, int_cnt2);
2383   movptr(result, str1); // string addr
2384 
2385   if (int_cnt2 > stride) {
2386     jmpb(SCAN_TO_SUBSTR);
2387 
2388     // Reload substr for rescan; this code
2389     // is executed only for large substrings (> 8 chars)
2390     bind(RELOAD_SUBSTR);
2391     if (ae == StrIntrinsicNode::UL) {
2392       pmovzxbw(vec, Address(str2, 0));
2393     } else {
2394       movdqu(vec, Address(str2, 0));
2395     }
2396     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2397 
2398     bind(RELOAD_STR);
2399     // We came here after the beginning of the substring was
2400     // matched but the rest of it was not, so we need to search
2401     // again. Start from the next element after the previous match.
2402 
2403     // cnt2 is the number of remaining substring elements and
2404     // cnt1 is the number of remaining string elements when the compare failed.
2405     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2406     subl(cnt1, cnt2);
2407     addl(cnt1, int_cnt2);
2408     movl(cnt2, int_cnt2); // Now restore cnt2
2409 
2410     decrementl(cnt1);     // Shift to next element
2411     cmpl(cnt1, cnt2);
2412     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2413 
2414     addptr(result, (1<<scale1));
2415 
2416   } // (int_cnt2 > 8)
2417 
2418   // Scan string for start of substr in 16-byte vectors
2419   bind(SCAN_TO_SUBSTR);
2420   pcmpestri(vec, Address(result, 0), mode);
2421   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2422   subl(cnt1, stride);
2423   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2424   cmpl(cnt1, cnt2);
2425   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2426   addptr(result, 16);
2427   jmpb(SCAN_TO_SUBSTR);
2428 
2429   // Found a potential substr
2430   bind(FOUND_CANDIDATE);
2431   // Matched whole vector if first element matched (tmp(rcx) == 0).
2432   if (int_cnt2 == stride) {
2433     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2434   } else { // int_cnt2 > 8
2435     jccb(Assembler::overflow, FOUND_SUBSTR);
2436   }
2437   // After pcmpestri tmp(rcx) contains matched element index
2438   // Compute start addr of substr
2439   lea(result, Address(result, tmp, scale1));
2440 
2441   // Make sure string is still long enough
2442   subl(cnt1, tmp);
2443   cmpl(cnt1, cnt2);
2444   if (int_cnt2 == stride) {
2445     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2446   } else { // int_cnt2 > 8
2447     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2448   }
2449   // Left less than substring.
2450 
2451   bind(RET_NOT_FOUND);
2452   movl(result, -1);
2453   jmp(EXIT);
2454 
2455   if (int_cnt2 > stride) {
2456     // This code is optimized for the case when whole substring
2457     // is matched if its head is matched.
2458     bind(MATCH_SUBSTR_HEAD);
2459     pcmpestri(vec, Address(result, 0), mode);
2460     // Reload only the string if it does not match
2461     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2462 
2463     Label CONT_SCAN_SUBSTR;
2464     // Compare the rest of substring (> 8 chars).
2465     bind(FOUND_SUBSTR);
2466     // First 8 chars are already matched.
2467     negptr(cnt2);
2468     addptr(cnt2, stride);
2469 
2470     bind(SCAN_SUBSTR);
2471     subl(cnt1, stride);
2472     cmpl(cnt2, -stride); // Do not read beyond substring
2473     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2474     // Back-up strings to avoid reading beyond substring:
2475     // cnt1 = cnt1 - cnt2 + 8
2476     addl(cnt1, cnt2); // cnt2 is negative
2477     addl(cnt1, stride);
2478     movl(cnt2, stride); negptr(cnt2);
2479     bind(CONT_SCAN_SUBSTR);
2480     if (int_cnt2 < (int)G) {
2481       int tail_off1 = int_cnt2<<scale1;
2482       int tail_off2 = int_cnt2<<scale2;
2483       if (ae == StrIntrinsicNode::UL) {
2484         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2485       } else {
2486         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2487       }
2488       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2489     } else {
2490       // calculate index in register to avoid integer overflow (int_cnt2*2)
2491       movl(tmp, int_cnt2);
2492       addptr(tmp, cnt2);
2493       if (ae == StrIntrinsicNode::UL) {
2494         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2495       } else {
2496         movdqu(vec, Address(str2, tmp, scale2, 0));
2497       }
2498       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2499     }
2500     // Need to reload the string pointers if the whole vector did not match
2501     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2502     addptr(cnt2, stride);
2503     jcc(Assembler::negative, SCAN_SUBSTR);
2504     // Fall through if found full substring
2505 
2506   } // (int_cnt2 > 8)
2507 
2508   bind(RET_FOUND);
2509   // Found result if we matched full small substring.
2510   // Compute substr offset
2511   subptr(result, str1);
2512   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2513     shrl(result, 1); // index
2514   }
2515   bind(EXIT);
2516 
2517 } // string_indexofC8
2518 
2519 // Small strings are loaded through the stack if they cross a page boundary.
2520 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2521                                        Register cnt1, Register cnt2,
2522                                        int int_cnt2,  Register result,
2523                                        XMMRegister vec, Register tmp,
2524                                        int ae) {
2525   ShortBranchVerifier sbv(this);
2526   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2527   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2528 
2529   //
2530   // int_cnt2 is length of small (< 8 chars) constant substring
2531   // or (-1) for non constant substring in which case its length
2532   // is in cnt2 register.
2533   //
2534   // Note, inline_string_indexOf() generates checks:
2535   // if (substr.count > string.count) return -1;
2536   // if (substr.count == 0) return 0;
2537   //
2538   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2539   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2540   // This method uses the pcmpestri instruction with bound registers
2541   //   inputs:
2542   //     xmm - substring
2543   //     rax - substring length (elements count)
2544   //     mem - scanned string
2545   //     rdx - string length (elements count)
2546   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2547   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2548   //   outputs:
2549   //     rcx - matched index in string
2550   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2551   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2552   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2553   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2554 
2555   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2556         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2557         FOUND_CANDIDATE;
2558 
2559   { //========================================================
2560     // We don't know where these strings are located
2561     // and we can't read beyond them. Load them through the stack.
2562     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2563 
2564     movptr(tmp, rsp); // save old SP
2565 
2566     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2567       if (int_cnt2 == (1>>scale2)) { // One byte
2568         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2569         load_unsigned_byte(result, Address(str2, 0));
2570         movdl(vec, result); // move 32 bits
2571       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2572         // Not enough header space in 32-bit VM: 12+3 = 15.
2573         movl(result, Address(str2, -1));
2574         shrl(result, 8);
2575         movdl(vec, result); // move 32 bits
2576       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2577         load_unsigned_short(result, Address(str2, 0));
2578         movdl(vec, result); // move 32 bits
2579       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2580         movdl(vec, Address(str2, 0)); // move 32 bits
2581       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2582         movq(vec, Address(str2, 0));  // move 64 bits
2583       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2584         // Array header size is 12 bytes in 32-bit VM
2585         // + 6 bytes for 3 chars == 18 bytes,
2586         // enough space to load vec and shift.
2587         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2588         if (ae == StrIntrinsicNode::UL) {
2589           int tail_off = int_cnt2-8;
2590           pmovzxbw(vec, Address(str2, tail_off));
2591           psrldq(vec, -2*tail_off);
2592         }
2593         else {
2594           int tail_off = int_cnt2*(1<<scale2);
2595           movdqu(vec, Address(str2, tail_off-16));
2596           psrldq(vec, 16-tail_off);
2597         }
2598       }
2599     } else { // not constant substring
2600       cmpl(cnt2, stride);
2601       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2602 
2603       // We can read beyond the string if str+16 does not cross a page boundary
2604       // since heaps are aligned and mapped by pages.
2605       assert(os::vm_page_size() < (int)G, "default page should be small");
2606       movl(result, str2); // We need only low 32 bits
2607       andl(result, (os::vm_page_size()-1));
2608       cmpl(result, (os::vm_page_size()-16));
2609       jccb(Assembler::belowEqual, CHECK_STR);
2610 
2611       // Move small strings to the stack to allow loading 16 bytes into vec.
2612       subptr(rsp, 16);
2613       int stk_offset = wordSize-(1<<scale2);
2614       push(cnt2);
2615 
2616       bind(COPY_SUBSTR);
2617       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2618         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2619         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2620       } else if (ae == StrIntrinsicNode::UU) {
2621         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2622         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2623       }
2624       decrement(cnt2);
2625       jccb(Assembler::notZero, COPY_SUBSTR);
2626 
2627       pop(cnt2);
2628       movptr(str2, rsp);  // New substring address
2629     } // non constant
2630 
2631     bind(CHECK_STR);
2632     cmpl(cnt1, stride);
2633     jccb(Assembler::aboveEqual, BIG_STRINGS);
2634 
2635     // Check cross page boundary.
2636     movl(result, str1); // We need only low 32 bits
2637     andl(result, (os::vm_page_size()-1));
2638     cmpl(result, (os::vm_page_size()-16));
2639     jccb(Assembler::belowEqual, BIG_STRINGS);
2640 
2641     subptr(rsp, 16);
2642     int stk_offset = -(1<<scale1);
2643     if (int_cnt2 < 0) { // not constant
2644       push(cnt2);
2645       stk_offset += wordSize;
2646     }
2647     movl(cnt2, cnt1);
2648 
2649     bind(COPY_STR);
2650     if (ae == StrIntrinsicNode::LL) {
2651       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2652       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2653     } else {
2654       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2655       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2656     }
2657     decrement(cnt2);
2658     jccb(Assembler::notZero, COPY_STR);
2659 
2660     if (int_cnt2 < 0) { // not constant
2661       pop(cnt2);
2662     }
2663     movptr(str1, rsp);  // New string address
2664 
2665     bind(BIG_STRINGS);
2666     // Load substring.
2667     if (int_cnt2 < 0) { // -1
2668       if (ae == StrIntrinsicNode::UL) {
2669         pmovzxbw(vec, Address(str2, 0));
2670       } else {
2671         movdqu(vec, Address(str2, 0));
2672       }
2673       push(cnt2);       // substr count
2674       push(str2);       // substr addr
2675       push(str1);       // string addr
2676     } else {
2677       // Small (< 8 chars) constant substrings are loaded already.
2678       movl(cnt2, int_cnt2);
2679     }
2680     push(tmp);  // original SP
2681 
2682   } // Finished loading
2683 
2684   //========================================================
2685   // Start search
2686   //
2687 
2688   movptr(result, str1); // string addr
2689 
2690   if (int_cnt2  < 0) {  // Only for non constant substring
2691     jmpb(SCAN_TO_SUBSTR);
2692 
2693     // SP saved at sp+0
2694     // String saved at sp+1*wordSize
2695     // Substr saved at sp+2*wordSize
2696     // Substr count saved at sp+3*wordSize
2697 
2698     // Reload substr for rescan, this code
2699     // is executed only for large substrings (> 8 chars)
2700     bind(RELOAD_SUBSTR);
2701     movptr(str2, Address(rsp, 2*wordSize));
2702     movl(cnt2, Address(rsp, 3*wordSize));
2703     if (ae == StrIntrinsicNode::UL) {
2704       pmovzxbw(vec, Address(str2, 0));
2705     } else {
2706       movdqu(vec, Address(str2, 0));
2707     }
2708     // We came here after the beginning of the substring was
2709     // matched but the rest of it was not so we need to search
2710     // again. Start from the next element after the previous match.
2711     subptr(str1, result); // Restore counter
2712     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2713       shrl(str1, 1);
2714     }
2715     addl(cnt1, str1);
2716     decrementl(cnt1);   // Shift to next element
2717     cmpl(cnt1, cnt2);
2718     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2719 
2720     addptr(result, (1<<scale1));
2721   } // non constant
2722 
2723   // Scan string for start of substr in 16-byte vectors
2724   bind(SCAN_TO_SUBSTR);
2725   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2726   pcmpestri(vec, Address(result, 0), mode);
2727   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2728   subl(cnt1, stride);
2729   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2730   cmpl(cnt1, cnt2);
2731   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2732   addptr(result, 16);
2733 
2734   bind(ADJUST_STR);
2735   cmpl(cnt1, stride); // Do not read beyond string
2736   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2737   // Back-up string to avoid reading beyond string.
2738   lea(result, Address(result, cnt1, scale1, -16));
2739   movl(cnt1, stride);
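  // result now points at the last 16 bytes of the string, so a full-width pcmpestri
  // load is safe; cnt1 is reset to a full stride for this final scan.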
2740   jmpb(SCAN_TO_SUBSTR);
2741 
2742   // Found a potential substr
2743   bind(FOUND_CANDIDATE);
2744   // After pcmpestri tmp(rcx) contains matched element index
2745 
2746   // Make sure string is still long enough
2747   subl(cnt1, tmp);
2748   cmpl(cnt1, cnt2);
2749   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2750   // Fewer elements left than the substring.
2751 
2752   bind(RET_NOT_FOUND);
2753   movl(result, -1);
2754   jmp(CLEANUP);
2755 
2756   bind(FOUND_SUBSTR);
2757   // Compute start addr of substr
2758   lea(result, Address(result, tmp, scale1));
2759   if (int_cnt2 > 0) { // Constant substring
2760     // Repeat search for small substring (< 8 chars)
2761     // from new point without reloading substring.
2762     // Have to check that we don't read beyond string.
2763     cmpl(tmp, stride-int_cnt2);
2764     jccb(Assembler::greater, ADJUST_STR);
2765     // Fall through if matched whole substring.
2766   } else { // non constant
2767     assert(int_cnt2 == -1, "should be non-constant (-1)");
2768 
2769     addl(tmp, cnt2);
2770     // Found result if we matched whole substring.
2771     cmpl(tmp, stride);
2772     jcc(Assembler::lessEqual, RET_FOUND);
2773 
2774     // Repeat search for small substring (<= 8 chars)
2775     // from new point 'str1' without reloading substring.
2776     cmpl(cnt2, stride);
2777     // Have to check that we don't read beyond string.
2778     jccb(Assembler::lessEqual, ADJUST_STR);
2779 
2780     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2781     // Compare the rest of substring (> 8 chars).
2782     movptr(str1, result);
2783 
2784     cmpl(tmp, cnt2);
2785     // First 8 chars are already matched.
2786     jccb(Assembler::equal, CHECK_NEXT);
2787 
2788     bind(SCAN_SUBSTR);
2789     pcmpestri(vec, Address(str1, 0), mode);
2790     // Need to reload string pointers if the whole vector did not match
2791     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2792 
2793     bind(CHECK_NEXT);
2794     subl(cnt2, stride);
2795     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2796     addptr(str1, 16);
2797     if (ae == StrIntrinsicNode::UL) {
2798       addptr(str2, 8);
2799     } else {
2800       addptr(str2, 16);
2801     }
2802     subl(cnt1, stride);
2803     cmpl(cnt2, stride); // Do not read beyond substring
2804     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2805     // Back-up strings to avoid reading beyond substring.
2806 
2807     if (ae == StrIntrinsicNode::UL) {
2808       lea(str2, Address(str2, cnt2, scale2, -8));
2809       lea(str1, Address(str1, cnt2, scale1, -16));
2810     } else {
2811       lea(str2, Address(str2, cnt2, scale2, -16));
2812       lea(str1, Address(str1, cnt2, scale1, -16));
2813     }
2814     subl(cnt1, cnt2);
2815     movl(cnt2, stride);
2816     addl(cnt1, stride);
2817     bind(CONT_SCAN_SUBSTR);
2818     if (ae == StrIntrinsicNode::UL) {
2819       pmovzxbw(vec, Address(str2, 0));
2820     } else {
2821       movdqu(vec, Address(str2, 0));
2822     }
2823     jmp(SCAN_SUBSTR);
2824 
2825     bind(RET_FOUND_LONG);
2826     movptr(str1, Address(rsp, wordSize));
2827   } // non constant
2828 
2829   bind(RET_FOUND);
2830   // Compute substr offset
2831   subptr(result, str1);
2832   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2833     shrl(result, 1); // index
2834   }
2835   bind(CLEANUP);
2836   pop(rsp); // restore SP
2837 
2838 } // string_indexof
2839 
2840 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2841                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2842   ShortBranchVerifier sbv(this);
2843   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2844 
2845   int stride = 8;
2846 
2847   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2848         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2849         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2850         FOUND_SEQ_CHAR, DONE_LABEL;
2851 
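  // UTF-16 (char) variant: with AVX2 scan 16 chars (32 bytes) per iteration, then
  // fall back to 8-char SSE4.2 vectors, and finish the tail one char at a time.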
2852   movptr(result, str1);
2853   if (UseAVX >= 2) {
2854     cmpl(cnt1, stride);
2855     jcc(Assembler::less, SCAN_TO_CHAR);
2856     cmpl(cnt1, 2*stride);
2857     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2858     movdl(vec1, ch);
2859     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2860     vpxor(vec2, vec2);
2861     movl(tmp, cnt1);
2862     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2863     andl(cnt1,0x0000000F);  //tail count (in chars)
2864 
2865     bind(SCAN_TO_16_CHAR_LOOP);
2866     vmovdqu(vec3, Address(result, 0));
2867     vpcmpeqw(vec3, vec3, vec1, 1);
2868     vptest(vec2, vec3);
2869     jcc(Assembler::carryClear, FOUND_CHAR);
2870     addptr(result, 32);
2871     subl(tmp, 2*stride);
2872     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2873     jmp(SCAN_TO_8_CHAR);
2874     bind(SCAN_TO_8_CHAR_INIT);
2875     movdl(vec1, ch);
2876     pshuflw(vec1, vec1, 0x00);
2877     pshufd(vec1, vec1, 0);
2878     pxor(vec2, vec2);
2879   }
2880   bind(SCAN_TO_8_CHAR);
2881   cmpl(cnt1, stride);
2882   jcc(Assembler::less, SCAN_TO_CHAR);
2883   if (UseAVX < 2) {
2884     movdl(vec1, ch);
2885     pshuflw(vec1, vec1, 0x00);
2886     pshufd(vec1, vec1, 0);
2887     pxor(vec2, vec2);
2888   }
2889   movl(tmp, cnt1);
2890   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2891   andl(cnt1,0x00000007);  //tail count (in chars)
2892 
2893   bind(SCAN_TO_8_CHAR_LOOP);
2894   movdqu(vec3, Address(result, 0));
2895   pcmpeqw(vec3, vec1);
2896   ptest(vec2, vec3);
2897   jcc(Assembler::carryClear, FOUND_CHAR);
2898   addptr(result, 16);
2899   subl(tmp, stride);
2900   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2901   bind(SCAN_TO_CHAR);
2902   testl(cnt1, cnt1);
2903   jcc(Assembler::zero, RET_NOT_FOUND);
2904   bind(SCAN_TO_CHAR_LOOP);
2905   load_unsigned_short(tmp, Address(result, 0));
2906   cmpl(ch, tmp);
2907   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2908   addptr(result, 2);
2909   subl(cnt1, 1);
2910   jccb(Assembler::zero, RET_NOT_FOUND);
2911   jmp(SCAN_TO_CHAR_LOOP);
2912 
2913   bind(RET_NOT_FOUND);
2914   movl(result, -1);
2915   jmpb(DONE_LABEL);
2916 
2917   bind(FOUND_CHAR);
2918   if (UseAVX >= 2) {
2919     vpmovmskb(tmp, vec3);
2920   } else {
2921     pmovmskb(tmp, vec3);
2922   }
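  // tmp now holds one bit per byte of the compare result; bsf yields the byte offset
  // of the first matching char within the vector, which is added to the vector base.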
2923   bsfl(ch, tmp);
2924   addptr(result, ch);
2925 
2926   bind(FOUND_SEQ_CHAR);
2927   subptr(result, str1);
2928   shrl(result, 1);
2929 
2930   bind(DONE_LABEL);
2931 } // string_indexof_char
2932 
2933 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2934                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2935   ShortBranchVerifier sbv(this);
2936   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2937 
2938   int stride = 16;
2939 
2940   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2941         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2942         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2943         FOUND_SEQ_CHAR, DONE_LABEL;
2944 
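  // Latin-1 (byte) variant: with AVX2 scan 32 bytes per iteration, then fall back to
  // 16-byte SSE4.2 vectors, and finish the tail one byte at a time.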
2945   movptr(result, str1);
2946   if (UseAVX >= 2) {
2947     cmpl(cnt1, stride);
2948     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2949     cmpl(cnt1, stride*2);
2950     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2951     movdl(vec1, ch);
2952     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2953     vpxor(vec2, vec2);
2954     movl(tmp, cnt1);
2955     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2956     andl(cnt1,0x0000001F);  //tail count (in chars)
2957 
2958     bind(SCAN_TO_32_CHAR_LOOP);
2959     vmovdqu(vec3, Address(result, 0));
2960     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2961     vptest(vec2, vec3);
2962     jcc(Assembler::carryClear, FOUND_CHAR);
2963     addptr(result, 32);
2964     subl(tmp, stride*2);
2965     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2966     jmp(SCAN_TO_16_CHAR);
2967 
2968     bind(SCAN_TO_16_CHAR_INIT);
2969     movdl(vec1, ch);
2970     pxor(vec2, vec2);
2971     pshufb(vec1, vec2);
2972   }
2973 
2974   bind(SCAN_TO_16_CHAR);
2975   cmpl(cnt1, stride);
2976   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2977   if (UseAVX < 2) {
2978     movdl(vec1, ch);
2979     pxor(vec2, vec2);
2980     pshufb(vec1, vec2);
2981   }
2982   movl(tmp, cnt1);
2983   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
2984   andl(cnt1,0x0000000F);  //tail count (in bytes)
2985 
2986   bind(SCAN_TO_16_CHAR_LOOP);
2987   movdqu(vec3, Address(result, 0));
2988   pcmpeqb(vec3, vec1);
2989   ptest(vec2, vec3);
2990   jcc(Assembler::carryClear, FOUND_CHAR);
2991   addptr(result, 16);
2992   subl(tmp, stride);
2993   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
2994 
2995   bind(SCAN_TO_CHAR_INIT);
2996   testl(cnt1, cnt1);
2997   jcc(Assembler::zero, RET_NOT_FOUND);
2998   bind(SCAN_TO_CHAR_LOOP);
2999   load_unsigned_byte(tmp, Address(result, 0));
3000   cmpl(ch, tmp);
3001   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3002   addptr(result, 1);
3003   subl(cnt1, 1);
3004   jccb(Assembler::zero, RET_NOT_FOUND);
3005   jmp(SCAN_TO_CHAR_LOOP);
3006 
3007   bind(RET_NOT_FOUND);
3008   movl(result, -1);
3009   jmpb(DONE_LABEL);
3010 
3011   bind(FOUND_CHAR);
3012   if (UseAVX >= 2) {
3013     vpmovmskb(tmp, vec3);
3014   } else {
3015     pmovmskb(tmp, vec3);
3016   }
3017   bsfl(ch, tmp);
3018   addptr(result, ch);
3019 
3020   bind(FOUND_SEQ_CHAR);
3021   subptr(result, str1);
3022 
3023   bind(DONE_LABEL);
3024 } // stringL_indexof_char
3025 
3026 // helper function for string_compare
3027 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3028                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3029                                            Address::ScaleFactor scale2, Register index, int ae) {
3030   if (ae == StrIntrinsicNode::LL) {
3031     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3032     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3033   } else if (ae == StrIntrinsicNode::UU) {
3034     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3035     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3036   } else {
3037     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3038     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3039   }
3040 }
3041 
3042 // Compare strings, used for char[] and byte[].
3043 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3044                                        Register cnt1, Register cnt2, Register result,
3045                                        XMMRegister vec1, int ae, KRegister mask) {
3046   ShortBranchVerifier sbv(this);
3047   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3048   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3049   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3050   int stride2x2 = 0x40;
3051   Address::ScaleFactor scale = Address::no_scale;
3052   Address::ScaleFactor scale1 = Address::no_scale;
3053   Address::ScaleFactor scale2 = Address::no_scale;
3054 
3055   if (ae != StrIntrinsicNode::LL) {
3056     stride2x2 = 0x20;
3057   }
3058 
3059   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3060     shrl(cnt2, 1);
3061   }
3062   // Compute the minimum of the string lengths and push the
3063   // difference of the string lengths onto the stack.
3064   // The conditional move below leaves the minimum in cnt2.
3065   movl(result, cnt1);
3066   subl(cnt1, cnt2);
3067   push(cnt1);
3068   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3069 
3070   // Is the minimum length zero?
3071   testl(cnt2, cnt2);
3072   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3073   if (ae == StrIntrinsicNode::LL) {
3074     // Load first bytes
3075     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3076     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3077   } else if (ae == StrIntrinsicNode::UU) {
3078     // Load first characters
3079     load_unsigned_short(result, Address(str1, 0));
3080     load_unsigned_short(cnt1, Address(str2, 0));
3081   } else {
3082     load_unsigned_byte(result, Address(str1, 0));
3083     load_unsigned_short(cnt1, Address(str2, 0));
3084   }
3085   subl(result, cnt1);
3086   jcc(Assembler::notZero,  POP_LABEL);
3087 
3088   if (ae == StrIntrinsicNode::UU) {
3089     // Divide length by 2 to get number of chars
3090     shrl(cnt2, 1);
3091   }
3092   cmpl(cnt2, 1);
3093   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3094 
3095   // Check if the strings start at the same location and setup scale and stride
3096   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3097     cmpptr(str1, str2);
3098     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3099     if (ae == StrIntrinsicNode::LL) {
3100       scale = Address::times_1;
3101       stride = 16;
3102     } else {
3103       scale = Address::times_2;
3104       stride = 8;
3105     }
3106   } else {
3107     scale1 = Address::times_1;
3108     scale2 = Address::times_2;
3109     // scale not used
3110     stride = 8;
3111   }
3112 
3113   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3114     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3115     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3116     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3117     Label COMPARE_TAIL_LONG;
3118     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3119 
3120     int pcmpmask = 0x19;
3121     if (ae == StrIntrinsicNode::LL) {
3122       pcmpmask &= ~0x01;
3123     }
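    // 0x19 selects pcmpestri string-compare mode with a negated result over unsigned
    // shorts; clearing the low bit switches the element size to unsigned bytes for
    // Latin-1 (see the pcmpestri input/output notes further below).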
3124 
3125     // Set up to compare 16-char (32-byte) vectors,
3126     // starting from the first character again because it has an aligned address.
3127     if (ae == StrIntrinsicNode::LL) {
3128       stride2 = 32;
3129     } else {
3130       stride2 = 16;
3131     }
3132     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3133       adr_stride = stride << scale;
3134     } else {
3135       adr_stride1 = 8;  //stride << scale1;
3136       adr_stride2 = 16; //stride << scale2;
3137     }
3138 
3139     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3140     // rax and rdx are used by pcmpestri as element counters
3141     movl(result, cnt2);
3142     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3143     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3144 
3145     // fast path : compare first 2 8-char vectors.
3146     bind(COMPARE_16_CHARS);
3147     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3148       movdqu(vec1, Address(str1, 0));
3149     } else {
3150       pmovzxbw(vec1, Address(str1, 0));
3151     }
3152     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3153     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3154 
3155     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3156       movdqu(vec1, Address(str1, adr_stride));
3157       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3158     } else {
3159       pmovzxbw(vec1, Address(str1, adr_stride1));
3160       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3161     }
3162     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3163     addl(cnt1, stride);
3164 
3165     // Compare the characters at index in cnt1
3166     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3167     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3168     subl(result, cnt2);
3169     jmp(POP_LABEL);
3170 
3171     // Setup the registers to start vector comparison loop
3172     bind(COMPARE_WIDE_VECTORS);
3173     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3174       lea(str1, Address(str1, result, scale));
3175       lea(str2, Address(str2, result, scale));
3176     } else {
3177       lea(str1, Address(str1, result, scale1));
3178       lea(str2, Address(str2, result, scale2));
3179     }
3180     subl(result, stride2);
3181     subl(cnt2, stride2);
3182     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3183     negptr(result);
3184 
3185     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3186     bind(COMPARE_WIDE_VECTORS_LOOP);
3187 
3188 #ifdef _LP64
3189     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3190       cmpl(cnt2, stride2x2);
3191       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3192       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3193       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3194 
3195       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3196       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3197         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3198         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3199       } else {
3200         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3201         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3202       }
3203       kortestql(mask, mask);
3204       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3205       addptr(result, stride2x2);  // update since we already compared at this addr
3206       subl(cnt2, stride2x2);      // and sub the size too
3207       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3208 
3209       vpxor(vec1, vec1);
3210       jmpb(COMPARE_WIDE_TAIL);
3211     }//if (VM_Version::supports_avx512vlbw())
3212 #endif // _LP64
3213 
3214 
3215     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3216     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3217       vmovdqu(vec1, Address(str1, result, scale));
3218       vpxor(vec1, Address(str2, result, scale));
3219     } else {
3220       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3221       vpxor(vec1, Address(str2, result, scale2));
3222     }
3223     vptest(vec1, vec1);
3224     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3225     addptr(result, stride2);
3226     subl(cnt2, stride2);
3227     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3228     // clean upper bits of YMM registers
3229     vpxor(vec1, vec1);
3230 
3231     // compare wide vectors tail
3232     bind(COMPARE_WIDE_TAIL);
3233     testptr(result, result);
3234     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3235 
3236     movl(result, stride2);
3237     movl(cnt2, result);
3238     negptr(result);
3239     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3240 
3241     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3242     bind(VECTOR_NOT_EQUAL);
3243     // clean upper bits of YMM registers
3244     vpxor(vec1, vec1);
3245     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3246       lea(str1, Address(str1, result, scale));
3247       lea(str2, Address(str2, result, scale));
3248     } else {
3249       lea(str1, Address(str1, result, scale1));
3250       lea(str2, Address(str2, result, scale2));
3251     }
3252     jmp(COMPARE_16_CHARS);
3253 
3254     // Compare tail chars, length between 1 and 15 chars
3255     bind(COMPARE_TAIL_LONG);
3256     movl(cnt2, result);
3257     cmpl(cnt2, stride);
3258     jcc(Assembler::less, COMPARE_SMALL_STR);
3259 
3260     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3261       movdqu(vec1, Address(str1, 0));
3262     } else {
3263       pmovzxbw(vec1, Address(str1, 0));
3264     }
3265     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3266     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3267     subptr(cnt2, stride);
3268     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3269     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3270       lea(str1, Address(str1, result, scale));
3271       lea(str2, Address(str2, result, scale));
3272     } else {
3273       lea(str1, Address(str1, result, scale1));
3274       lea(str2, Address(str2, result, scale2));
3275     }
3276     negptr(cnt2);
3277     jmpb(WHILE_HEAD_LABEL);
3278 
3279     bind(COMPARE_SMALL_STR);
3280   } else if (UseSSE42Intrinsics) {
3281     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3282     int pcmpmask = 0x19;
3283     // Set up to compare 8-char (16-byte) vectors,
3284     // starting from the first character again because it has an aligned address.
3285     movl(result, cnt2);
3286     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3287     if (ae == StrIntrinsicNode::LL) {
3288       pcmpmask &= ~0x01;
3289     }
3290     jcc(Assembler::zero, COMPARE_TAIL);
3291     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3292       lea(str1, Address(str1, result, scale));
3293       lea(str2, Address(str2, result, scale));
3294     } else {
3295       lea(str1, Address(str1, result, scale1));
3296       lea(str2, Address(str2, result, scale2));
3297     }
3298     negptr(result);
3299 
3300     // pcmpestri
3301     //   inputs:
3302     //     vec1- substring
3303     //     rax - negative string length (elements count)
3304     //     mem - scanned string
3305     //     rdx - string length (elements count)
3306     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3307     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3308     //   outputs:
3309     //     rcx - first mismatched element index
3310     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3311 
3312     bind(COMPARE_WIDE_VECTORS);
3313     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3314       movdqu(vec1, Address(str1, result, scale));
3315       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3316     } else {
3317       pmovzxbw(vec1, Address(str1, result, scale1));
3318       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3319     }
3320     // After pcmpestri cnt1(rcx) contains mismatched element index
3321 
3322     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3323     addptr(result, stride);
3324     subptr(cnt2, stride);
3325     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3326 
3327     // compare wide vectors tail
3328     testptr(result, result);
3329     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3330 
3331     movl(cnt2, stride);
3332     movl(result, stride);
3333     negptr(result);
3334     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3335       movdqu(vec1, Address(str1, result, scale));
3336       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3337     } else {
3338       pmovzxbw(vec1, Address(str1, result, scale1));
3339       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3340     }
3341     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3342 
3343     // Mismatched characters in the vectors
3344     bind(VECTOR_NOT_EQUAL);
3345     addptr(cnt1, result);
3346     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3347     subl(result, cnt2);
3348     jmpb(POP_LABEL);
3349 
3350     bind(COMPARE_TAIL); // limit is zero
3351     movl(cnt2, result);
3352     // Fallthru to tail compare
3353   }
3354   // Shift str2 and str1 to the end of the arrays, negate min
3355   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3356     lea(str1, Address(str1, cnt2, scale));
3357     lea(str2, Address(str2, cnt2, scale));
3358   } else {
3359     lea(str1, Address(str1, cnt2, scale1));
3360     lea(str2, Address(str2, cnt2, scale2));
3361   }
3362   decrementl(cnt2);  // first character was compared already
3363   negptr(cnt2);
3364 
3365   // Compare the rest of the elements
3366   bind(WHILE_HEAD_LABEL);
3367   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3368   subl(result, cnt1);
3369   jccb(Assembler::notZero, POP_LABEL);
3370   increment(cnt2);
3371   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3372 
3373   // Strings are equal up to min length.  Return the length difference.
3374   bind(LENGTH_DIFF_LABEL);
3375   pop(result);
3376   if (ae == StrIntrinsicNode::UU) {
3377     // Divide diff by 2 to get number of chars
3378     sarl(result, 1);
3379   }
3380   jmpb(DONE_LABEL);
3381 
3382 #ifdef _LP64
3383   if (VM_Version::supports_avx512vlbw()) {
3384 
3385     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3386 
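    // mask has a 1 for every byte position that compared equal; inverting it and
    // bit-scanning forward gives the first mismatching element in the 64-byte group.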
3387     kmovql(cnt1, mask);
3388     notq(cnt1);
3389     bsfq(cnt2, cnt1);
3390     if (ae != StrIntrinsicNode::LL) {
3391       // Divide diff by 2 to get number of chars
3392       sarl(cnt2, 1);
3393     }
3394     addq(result, cnt2);
3395     if (ae == StrIntrinsicNode::LL) {
3396       load_unsigned_byte(cnt1, Address(str2, result));
3397       load_unsigned_byte(result, Address(str1, result));
3398     } else if (ae == StrIntrinsicNode::UU) {
3399       load_unsigned_short(cnt1, Address(str2, result, scale));
3400       load_unsigned_short(result, Address(str1, result, scale));
3401     } else {
3402       load_unsigned_short(cnt1, Address(str2, result, scale2));
3403       load_unsigned_byte(result, Address(str1, result, scale1));
3404     }
3405     subl(result, cnt1);
3406     jmpb(POP_LABEL);
3407   }//if (VM_Version::supports_avx512vlbw())
3408 #endif // _LP64
3409 
3410   // Discard the stored length difference
3411   bind(POP_LABEL);
3412   pop(cnt1);
3413 
3414   // That's it
3415   bind(DONE_LABEL);
3416   if (ae == StrIntrinsicNode::UL) {
3417     negl(result);
3418   }
3419 
3420 }
3421 
3422 // Search for Non-ASCII character (Negative byte value) in a byte array,
3423 // return true if it has any and false otherwise.
3424 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3425 //   @IntrinsicCandidate
3426 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3427 //     for (int i = off; i < off + len; i++) {
3428 //       if (ba[i] < 0) {
3429 //         return true;
3430 //       }
3431 //     }
3432 //     return false;
3433 //   }
3434 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3435   Register result, Register tmp1,
3436   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3437   // rsi: byte array
3438   // rcx: len
3439   // rax: result
3440   ShortBranchVerifier sbv(this);
3441   assert_different_registers(ary1, len, result, tmp1);
3442   assert_different_registers(vec1, vec2);
3443   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3444 
3445   // len == 0
3446   testl(len, len);
3447   jcc(Assembler::zero, FALSE_LABEL);
3448 
3449   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3450     VM_Version::supports_avx512vlbw() &&
3451     VM_Version::supports_bmi2()) {
3452 
3453     Label test_64_loop, test_tail;
3454     Register tmp3_aliased = len;
3455 
3456     movl(tmp1, len);
3457     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3458 
3459     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3460     andl(len, ~(64 - 1));    // vector count (in chars)
3461     jccb(Assembler::zero, test_tail);
3462 
3463     lea(ary1, Address(ary1, len, Address::times_1));
3464     negptr(len);
3465 
3466     bind(test_64_loop);
3467     // Check whether our 64 elements of size byte contain negatives
3468     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3469     kortestql(mask1, mask1);
3470     jcc(Assembler::notZero, TRUE_LABEL);
3471 
3472     addptr(len, 64);
3473     jccb(Assembler::notZero, test_64_loop);
3474 
3475 
3476     bind(test_tail);
3477     // bail out when there is nothing to be done
3478     testl(tmp1, -1);
3479     jcc(Assembler::zero, FALSE_LABEL);
3480 
3481     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3482 #ifdef _LP64
3483     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3484     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3485     notq(tmp3_aliased);
3486     kmovql(mask2, tmp3_aliased);
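    // Example: tmp1 == 5 gives ~(~0 << 5) == 0x1F, so mask2 selects exactly the
    // five tail bytes that still need to be checked.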
3487 #else
3488     Label k_init;
3489     jmp(k_init);
3490 
3491     // We cannot read 64 bits from a general-purpose register, so the data needed
3492     // to compose 64 ones is emitted into the instruction stream.
3493     // We emit a 64-byte-wide series of elements 0..63 which is later used as a
3494     // compare target against the tail count held in the tmp1 register.
3495     // The result is a k register with tmp1 consecutive 1s, counting from the
3496     // least significant bit.
3497     address tmp = pc();
3498     emit_int64(0x0706050403020100);
3499     emit_int64(0x0F0E0D0C0B0A0908);
3500     emit_int64(0x1716151413121110);
3501     emit_int64(0x1F1E1D1C1B1A1918);
3502     emit_int64(0x2726252423222120);
3503     emit_int64(0x2F2E2D2C2B2A2928);
3504     emit_int64(0x3736353433323130);
3505     emit_int64(0x3F3E3D3C3B3A3938);
3506 
3507     bind(k_init);
3508     lea(len, InternalAddress(tmp));
3509     // create mask to test for negative byte inside a vector
3510     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3511     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3512 
3513 #endif
3514     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3515     ktestq(mask1, mask2);
3516     jcc(Assembler::notZero, TRUE_LABEL);
3517 
3518     jmp(FALSE_LABEL);
3519   } else {
3520     movl(result, len); // copy
3521 
3522     if (UseAVX >= 2 && UseSSE >= 2) {
3523       // With AVX2, use 32-byte vector compare
3524       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3525 
3526       // Compare 32-byte vectors
3527       andl(result, 0x0000001f);  //   tail count (in bytes)
3528       andl(len, 0xffffffe0);   // vector count (in bytes)
3529       jccb(Assembler::zero, COMPARE_TAIL);
3530 
3531       lea(ary1, Address(ary1, len, Address::times_1));
3532       negptr(len);
3533 
3534       movl(tmp1, 0x80808080);   // create mask to test for negative bytes (sign bit set) in vector
3535       movdl(vec2, tmp1);
3536       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3537 
3538       bind(COMPARE_WIDE_VECTORS);
3539       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3540       vptest(vec1, vec2);
3541       jccb(Assembler::notZero, TRUE_LABEL);
3542       addptr(len, 32);
3543       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3544 
3545       testl(result, result);
3546       jccb(Assembler::zero, FALSE_LABEL);
3547 
3548       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3549       vptest(vec1, vec2);
3550       jccb(Assembler::notZero, TRUE_LABEL);
3551       jmpb(FALSE_LABEL);
3552 
3553       bind(COMPARE_TAIL); // len is zero
3554       movl(len, result);
3555       // Fallthru to tail compare
3556     } else if (UseSSE42Intrinsics) {
3557       // With SSE4.2, use double quad vector compare
3558       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3559 
3560       // Compare 16-byte vectors
3561       andl(result, 0x0000000f);  //   tail count (in bytes)
3562       andl(len, 0xfffffff0);   // vector count (in bytes)
3563       jcc(Assembler::zero, COMPARE_TAIL);
3564 
3565       lea(ary1, Address(ary1, len, Address::times_1));
3566       negptr(len);
3567 
3568       movl(tmp1, 0x80808080);
3569       movdl(vec2, tmp1);
3570       pshufd(vec2, vec2, 0);
3571 
3572       bind(COMPARE_WIDE_VECTORS);
3573       movdqu(vec1, Address(ary1, len, Address::times_1));
3574       ptest(vec1, vec2);
3575       jcc(Assembler::notZero, TRUE_LABEL);
3576       addptr(len, 16);
3577       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3578 
3579       testl(result, result);
3580       jcc(Assembler::zero, FALSE_LABEL);
3581 
3582       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3583       ptest(vec1, vec2);
3584       jccb(Assembler::notZero, TRUE_LABEL);
3585       jmpb(FALSE_LABEL);
3586 
3587       bind(COMPARE_TAIL); // len is zero
3588       movl(len, result);
3589       // Fallthru to tail compare
3590     }
3591   }
3592   // Compare 4-byte vectors
3593   andl(len, 0xfffffffc); // vector count (in bytes)
3594   jccb(Assembler::zero, COMPARE_CHAR);
3595 
3596   lea(ary1, Address(ary1, len, Address::times_1));
3597   negptr(len);
3598 
3599   bind(COMPARE_VECTORS);
3600   movl(tmp1, Address(ary1, len, Address::times_1));
3601   andl(tmp1, 0x80808080);
3602   jccb(Assembler::notZero, TRUE_LABEL);
3603   addptr(len, 4);
3604   jcc(Assembler::notZero, COMPARE_VECTORS);
3605 
3606   // Compare trailing char (final 2 bytes), if any
3607   bind(COMPARE_CHAR);
3608   testl(result, 0x2);   // tail  char
3609   jccb(Assembler::zero, COMPARE_BYTE);
3610   load_unsigned_short(tmp1, Address(ary1, 0));
3611   andl(tmp1, 0x00008080);
3612   jccb(Assembler::notZero, TRUE_LABEL);
3613   subptr(result, 2);
3614   lea(ary1, Address(ary1, 2));
3615 
3616   bind(COMPARE_BYTE);
3617   testl(result, 0x1);   // tail  byte
3618   jccb(Assembler::zero, FALSE_LABEL);
3619   load_unsigned_byte(tmp1, Address(ary1, 0));
3620   andl(tmp1, 0x00000080);
3621   jccb(Assembler::notEqual, TRUE_LABEL);
3622   jmpb(FALSE_LABEL);
3623 
3624   bind(TRUE_LABEL);
3625   movl(result, 1);   // return true
3626   jmpb(DONE);
3627 
3628   bind(FALSE_LABEL);
3629   xorl(result, result); // return false
3630 
3631   // That's it
3632   bind(DONE);
3633   if (UseAVX >= 2 && UseSSE >= 2) {
3634     // clean upper bits of YMM registers
3635     vpxor(vec1, vec1);
3636     vpxor(vec2, vec2);
3637   }
3638 }
3639 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
3640 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3641                                       Register limit, Register result, Register chr,
3642                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3643   ShortBranchVerifier sbv(this);
3644   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3645 
3646   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3647   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
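  // For whole-array comparison the lengths and data start are read from the array
  // headers below; for substring comparison the caller passes data pointers and an
  // element count (limit) directly.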
3648 
3649   if (is_array_equ) {
3650     // Check the input args
3651     cmpoop(ary1, ary2);
3652     jcc(Assembler::equal, TRUE_LABEL);
3653 
3654     // Need additional checks for arrays_equals.
3655     testptr(ary1, ary1);
3656     jcc(Assembler::zero, FALSE_LABEL);
3657     testptr(ary2, ary2);
3658     jcc(Assembler::zero, FALSE_LABEL);
3659 
3660     // Check the lengths
3661     movl(limit, Address(ary1, length_offset));
3662     cmpl(limit, Address(ary2, length_offset));
3663     jcc(Assembler::notEqual, FALSE_LABEL);
3664   }
3665 
3666   // count == 0
3667   testl(limit, limit);
3668   jcc(Assembler::zero, TRUE_LABEL);
3669 
3670   if (is_array_equ) {
3671     // Load array address
3672     lea(ary1, Address(ary1, base_offset));
3673     lea(ary2, Address(ary2, base_offset));
3674   }
3675 
3676   if (is_array_equ && is_char) {
3677     // arrays_equals when used for char[].
3678     shll(limit, 1);      // convert char count to byte count (still != 0)
3679   }
3680   movl(result, limit); // copy
3681 
3682   if (UseAVX >= 2) {
3683     // With AVX2, use 32-byte vector compare
3684     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3685 
3686     // Compare 32-byte vectors
3687     andl(result, 0x0000001f);  //   tail count (in bytes)
3688     andl(limit, 0xffffffe0);   // vector count (in bytes)
3689     jcc(Assembler::zero, COMPARE_TAIL);
3690 
3691     lea(ary1, Address(ary1, limit, Address::times_1));
3692     lea(ary2, Address(ary2, limit, Address::times_1));
3693     negptr(limit);
3694 
3695 #ifdef _LP64
3696     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3697       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3698 
3699       cmpl(limit, -64);
3700       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3701 
3702       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3703 
3704       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3705       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3706       kortestql(mask, mask);
3707       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3708       addptr(limit, 64);  // update since we already compared at this addr
3709       cmpl(limit, -64);
3710       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3711 
3712       // At this point we may still need to compare -limit+result bytes.
3713     // We could execute the next two instructions and just continue via the non-wide path:
3714       //  cmpl(limit, 0);
3715       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3716       // But since we stopped at the points ary{1,2}+limit which are
3717       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3718       // (|limit| <= 32 and result < 32),
3719       // we may just compare the last 64 bytes.
3720       //
3721       addptr(result, -64);   // it is safe, bc we just came from this area
3722       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3723       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3724       kortestql(mask, mask);
3725       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3726 
3727       jmp(TRUE_LABEL);
3728 
3729       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3730 
3731     }//if (VM_Version::supports_avx512vlbw())
3732 #endif //_LP64
3733     bind(COMPARE_WIDE_VECTORS);
3734     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3735     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3736     vpxor(vec1, vec2);
3737 
3738     vptest(vec1, vec1);
3739     jcc(Assembler::notZero, FALSE_LABEL);
3740     addptr(limit, 32);
3741     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3742 
3743     testl(result, result);
3744     jcc(Assembler::zero, TRUE_LABEL);
3745 
3746     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3747     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3748     vpxor(vec1, vec2);
3749 
3750     vptest(vec1, vec1);
3751     jccb(Assembler::notZero, FALSE_LABEL);
3752     jmpb(TRUE_LABEL);
3753 
3754     bind(COMPARE_TAIL); // limit is zero
3755     movl(limit, result);
3756     // Fallthru to tail compare
3757   } else if (UseSSE42Intrinsics) {
3758     // With SSE4.2, use double quad vector compare
3759     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3760 
3761     // Compare 16-byte vectors
3762     andl(result, 0x0000000f);  //   tail count (in bytes)
3763     andl(limit, 0xfffffff0);   // vector count (in bytes)
3764     jcc(Assembler::zero, COMPARE_TAIL);
3765 
3766     lea(ary1, Address(ary1, limit, Address::times_1));
3767     lea(ary2, Address(ary2, limit, Address::times_1));
3768     negptr(limit);
3769 
3770     bind(COMPARE_WIDE_VECTORS);
3771     movdqu(vec1, Address(ary1, limit, Address::times_1));
3772     movdqu(vec2, Address(ary2, limit, Address::times_1));
3773     pxor(vec1, vec2);
3774 
3775     ptest(vec1, vec1);
3776     jcc(Assembler::notZero, FALSE_LABEL);
3777     addptr(limit, 16);
3778     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3779 
3780     testl(result, result);
3781     jcc(Assembler::zero, TRUE_LABEL);
3782 
3783     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3784     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3785     pxor(vec1, vec2);
3786 
3787     ptest(vec1, vec1);
3788     jccb(Assembler::notZero, FALSE_LABEL);
3789     jmpb(TRUE_LABEL);
3790 
3791     bind(COMPARE_TAIL); // limit is zero
3792     movl(limit, result);
3793     // Fallthru to tail compare
3794   }
3795 
3796   // Compare 4-byte vectors
3797   andl(limit, 0xfffffffc); // vector count (in bytes)
3798   jccb(Assembler::zero, COMPARE_CHAR);
3799 
3800   lea(ary1, Address(ary1, limit, Address::times_1));
3801   lea(ary2, Address(ary2, limit, Address::times_1));
3802   negptr(limit);
3803 
3804   bind(COMPARE_VECTORS);
3805   movl(chr, Address(ary1, limit, Address::times_1));
3806   cmpl(chr, Address(ary2, limit, Address::times_1));
3807   jccb(Assembler::notEqual, FALSE_LABEL);
3808   addptr(limit, 4);
3809   jcc(Assembler::notZero, COMPARE_VECTORS);
3810 
3811   // Compare trailing char (final 2 bytes), if any
3812   bind(COMPARE_CHAR);
3813   testl(result, 0x2);   // tail  char
3814   jccb(Assembler::zero, COMPARE_BYTE);
3815   load_unsigned_short(chr, Address(ary1, 0));
3816   load_unsigned_short(limit, Address(ary2, 0));
3817   cmpl(chr, limit);
3818   jccb(Assembler::notEqual, FALSE_LABEL);
3819 
3820   if (is_array_equ && is_char) {
3821     bind(COMPARE_BYTE);
3822   } else {
3823     lea(ary1, Address(ary1, 2));
3824     lea(ary2, Address(ary2, 2));
3825 
3826     bind(COMPARE_BYTE);
3827     testl(result, 0x1);   // tail  byte
3828     jccb(Assembler::zero, TRUE_LABEL);
3829     load_unsigned_byte(chr, Address(ary1, 0));
3830     load_unsigned_byte(limit, Address(ary2, 0));
3831     cmpl(chr, limit);
3832     jccb(Assembler::notEqual, FALSE_LABEL);
3833   }
3834   bind(TRUE_LABEL);
3835   movl(result, 1);   // return true
3836   jmpb(DONE);
3837 
3838   bind(FALSE_LABEL);
3839   xorl(result, result); // return false
3840 
3841   // That's it
3842   bind(DONE);
3843   if (UseAVX >= 2) {
3844     // clean upper bits of YMM registers
3845     vpxor(vec1, vec1);
3846     vpxor(vec2, vec2);
3847   }
3848 }
3849 
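// Dispatch a masked (predicated) vector operation to the matching AVX-512 instruction.
// This variant covers shifts and rotates by an immediate count; 'merge' selects
// merge-masking (preserve the destination lane) rather than zero-masking for lanes
// whose mask bit is clear.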
3850 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3851                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
3852   switch(ideal_opc) {
3853     case Op_LShiftVS:
3854       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
3855     case Op_LShiftVI:
3856       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
3857     case Op_LShiftVL:
3858       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
3859     case Op_RShiftVS:
3860       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
3861     case Op_RShiftVI:
3862       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
3863     case Op_RShiftVL:
3864       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
3865     case Op_URShiftVS:
3866       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
3867     case Op_URShiftVI:
3868       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
3869     case Op_URShiftVL:
3870       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
3871     case Op_RotateRightV:
3872       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3873     case Op_RotateLeftV:
3874       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3875     default:
3876       fatal("Unsupported masked operation"); break;
3877   }
3878 }
3879 
3880 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3881                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
3882                                     bool is_varshift) {
3883   switch (ideal_opc) {
3884     case Op_AddVB:
3885       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3886     case Op_AddVS:
3887       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3888     case Op_AddVI:
3889       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3890     case Op_AddVL:
3891       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3892     case Op_AddVF:
3893       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3894     case Op_AddVD:
3895       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3896     case Op_SubVB:
3897       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3898     case Op_SubVS:
3899       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
3900     case Op_SubVI:
3901       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
3902     case Op_SubVL:
3903       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
3904     case Op_SubVF:
3905       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
3906     case Op_SubVD:
3907       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
3908     case Op_MulVS:
3909       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
3910     case Op_MulVI:
3911       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
3912     case Op_MulVL:
3913       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
3914     case Op_MulVF:
3915       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
3916     case Op_MulVD:
3917       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
3918     case Op_DivVF:
3919       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
3920     case Op_DivVD:
3921       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
3922     case Op_SqrtVF:
3923       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
3924     case Op_SqrtVD:
3925       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
3926     case Op_AbsVB:
3927       evpabsb(dst, mask, src2, merge, vlen_enc); break;
3928     case Op_AbsVS:
3929       evpabsw(dst, mask, src2, merge, vlen_enc); break;
3930     case Op_AbsVI:
3931       evpabsd(dst, mask, src2, merge, vlen_enc); break;
3932     case Op_AbsVL:
3933       evpabsq(dst, mask, src2, merge, vlen_enc); break;
3934     case Op_FmaVF:
3935       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
3936     case Op_FmaVD:
3937       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
3938     case Op_VectorRearrange:
3939       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
3940     case Op_LShiftVS:
3941       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3942     case Op_LShiftVI:
3943       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3944     case Op_LShiftVL:
3945       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3946     case Op_RShiftVS:
3947       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3948     case Op_RShiftVI:
3949       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3950     case Op_RShiftVL:
3951       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3952     case Op_URShiftVS:
3953       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3954     case Op_URShiftVI:
3955       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3956     case Op_URShiftVL:
3957       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3958     case Op_RotateLeftV:
3959       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3960     case Op_RotateRightV:
3961       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3962     case Op_MaxV:
3963       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3964     case Op_MinV:
3965       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3966     case Op_XorV:
3967       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3968     case Op_OrV:
3969       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3970     case Op_AndV:
3971       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3972     default:
3973       fatal("Unsupported masked operation"); break;
3974   }
3975 }
3976 
3977 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3978                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
3979   switch (ideal_opc) {
3980     case Op_AddVB:
3981       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3982     case Op_AddVS:
3983       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3984     case Op_AddVI:
3985       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3986     case Op_AddVL:
3987       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3988     case Op_AddVF:
3989       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3990     case Op_AddVD:
3991       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3992     case Op_SubVB:
3993       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3994     case Op_SubVS:
3995       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
3996     case Op_SubVI:
3997       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
3998     case Op_SubVL:
3999       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4000     case Op_SubVF:
4001       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4002     case Op_SubVD:
4003       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4004     case Op_MulVS:
4005       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4006     case Op_MulVI:
4007       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4008     case Op_MulVL:
4009       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4010     case Op_MulVF:
4011       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4012     case Op_MulVD:
4013       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4014     case Op_DivVF:
4015       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4016     case Op_DivVD:
4017       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4018     case Op_FmaVF:
4019       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4020     case Op_FmaVD:
4021       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4022     case Op_MaxV:
4023       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4024     case Op_MinV:
4025       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4026     case Op_XorV:
4027       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4028     case Op_OrV:
4029       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4030     case Op_AndV:
4031       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4032     default:
4033       fatal("Unsupported masked operation"); break;
4034   }
4035 }
4036 
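// Combine two opmask (k) registers; mask_len selects the width of the k-instruction
// (byte/word/dword/qword forms) via the element-type mapping below.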
4037 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4038                                   KRegister src1, KRegister src2) {
4039   BasicType etype = T_ILLEGAL;
4040   switch(mask_len) {
4041     case 2:
4042     case 4:
4043     case 8:  etype = T_BYTE; break;
4044     case 16: etype = T_SHORT; break;
4045     case 32: etype = T_INT; break;
4046     case 64: etype = T_LONG; break;
4047     default: fatal("Unsupported type"); break;
4048   }
4049   assert(etype != T_ILLEGAL, "");
4050   switch(ideal_opc) {
4051     case Op_AndVMask:
4052       kand(etype, dst, src1, src2); break;
4053     case Op_OrVMask:
4054       kor(etype, dst, src1, src2); break;
4055     case Op_XorVMask:
4056       kxor(etype, dst, src1, src2); break;
4057     default:
4058       fatal("Unsupported masked operation"); break;
4059   }
4060 }
4061 
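// Reduce an opmask register: materialize the mask as a bit string in a GPR, then
// popcnt / bsr / bsf implement trueCount / lastTrue / firstTrue respectively.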
4062 #ifdef _LP64
4063 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask,
4064                                               Register tmp, int masklen, int masksize,
4065                                               int vec_enc) {
4066   if (VM_Version::supports_avx512bw()) {
4067     kmovql(tmp, mask);
4068   } else {
4069     assert(masklen <= 16, "");
4070     kmovwl(tmp, mask);
4071   }
4072   if (masksize < 16) {
4073     andq(tmp, (((jlong)1 << masklen) - 1));
4074   }
4075   switch(opc) {
4076     case Op_VectorMaskTrueCount:
4077       popcntq(dst, tmp);
4078       break;
4079     case Op_VectorMaskLastTrue:
4080       mov64(dst, -1);
4081       bsrq(tmp, tmp);
4082       cmov(Assembler::notZero, dst, tmp);
4083       break;
4084     case Op_VectorMaskFirstTrue:
4085       mov64(dst, masklen);
4086       bsfq(tmp, tmp);
4087       cmov(Assembler::notZero, dst, tmp);
4088       break;
4089     default: assert(false, "Unhandled mask operation");
4090   }
4091 }
4092 
4093 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4094                                               XMMRegister xtmp1, Register tmp, int masklen, int masksize,
4095                                               int vec_enc) {
4096   assert(VM_Version::supports_avx(), "");
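  // The mask arrives as a vector of 0/1 bytes; 0 - lane maps each 1 to 0xFF so that
  // vpmovmskb can gather one bit per element into the GPR before the reduction.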
4097   vpxor(xtmp, xtmp, xtmp, vec_enc);
4098   vpsubb(xtmp, xtmp, mask, vec_enc);
4099   vpmovmskb(tmp, xtmp, vec_enc);
4100   if (masksize < 16) {
4101     andq(tmp, (((jlong)1 << masklen) - 1));
4102   }
4103   switch(opc) {
4104     case Op_VectorMaskTrueCount:
4105       popcntq(dst, tmp);
4106       break;
4107     case Op_VectorMaskLastTrue:
4108       mov64(dst, -1);
4109       bsrq(tmp, tmp);
4110       cmov(Assembler::notZero, dst, tmp);
4111       break;
4112     case Op_VectorMaskFirstTrue:
4113       mov64(dst, masklen);
4114       bsfq(tmp, tmp);
4115       cmov(Assembler::notZero, dst, tmp);
4116       break;
4117     default: assert(false, "Unhandled mask operation");
4118   }
4119 }
4120 #endif
--- EOF ---