1 /*
   2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
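// Map a vector length in bytes to the AVX length encoding expected by the
// assembler. Vectors narrower than 16 bytes still use the 128-bit encoding;
// any other length is a programming error.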
  36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  37   switch (vlen_in_bytes) {
  38     case  4: // fall-through
  39     case  8: // fall-through
  40     case 16: return Assembler::AVX_128bit;
  41     case 32: return Assembler::AVX_256bit;
  42     case 64: return Assembler::AVX_512bit;
  43 
  44     default: {
  45       ShouldNotReachHere();
  46       return Assembler::AVX_NoVec;
  47     }
  48   }
  49 }
  50 
  51 #if INCLUDE_RTM_OPT
  52 
  53 // Update rtm_counters based on abort status
  54 // input: abort_status
  55 //        rtm_counters (RTMLockingCounters*)
  56 // flags are killed
  57 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  58 
  59   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  60   if (PrintPreciseRTMLockingStatistics) {
  61     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  62       Label check_abort;
  63       testl(abort_status, (1<<i));
  64       jccb(Assembler::equal, check_abort);
  65       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  66       bind(check_abort);
  67     }
  68   }
  69 }
  70 
// Branch if ((random & (count-1)) != 0); count is 2^n
// tmp, scr and flags are killed
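// The low bits of the time-stamp counter act as a cheap pseudo-random value;
// with count == 2^n, the branch is taken roughly (count-1)/count of the time.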
  73 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  74   assert(tmp == rax, "");
  75   assert(scr == rdx, "");
  76   rdtsc(); // modifies EDX:EAX
  77   andptr(tmp, count-1);
  78   jccb(Assembler::notZero, brLabel);
  79 }
  80 
  81 // Perform abort ratio calculation, set no_rtm bit if high ratio
  82 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  83 // tmpReg, rtm_counters_Reg and flags are killed
  84 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  85                                                     Register rtm_counters_Reg,
  86                                                     RTMLockingCounters* rtm_counters,
  87                                                     Metadata* method_data) {
  88   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
  89 
  90   if (RTMLockingCalculationDelay > 0) {
  91     // Delay calculation
  92     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
  93     testptr(tmpReg, tmpReg);
  94     jccb(Assembler::equal, L_done);
  95   }
  96   // Abort ratio calculation only if abort_count > RTMAbortThreshold
  97   //   Aborted transactions = abort_count * 100
  98   //   All transactions = total_count *  RTMTotalCountIncrRate
  99   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
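  //   The comparison below is done on the scaled counts (abort_count * 100 vs.
  //   total_count * RTMTotalCountIncrRate * RTMAbortRatio), so no division is needed.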
 100 
 101   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 102   cmpptr(tmpReg, RTMAbortThreshold);
 103   jccb(Assembler::below, L_check_always_rtm2);
 104   imulptr(tmpReg, tmpReg, 100);
 105 
 106   Register scrReg = rtm_counters_Reg;
 107   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 108   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 109   imulptr(scrReg, scrReg, RTMAbortRatio);
 110   cmpptr(tmpReg, scrReg);
 111   jccb(Assembler::below, L_check_always_rtm1);
 112   if (method_data != NULL) {
 113     // set rtm_state to "no rtm" in MDO
 114     mov_metadata(tmpReg, method_data);
 115     lock();
 116     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 117   }
 118   jmpb(L_done);
 119   bind(L_check_always_rtm1);
 120   // Reload RTMLockingCounters* address
 121   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 122   bind(L_check_always_rtm2);
 123   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 124   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 125   jccb(Assembler::below, L_done);
 126   if (method_data != NULL) {
 127     // set rtm_state to "always rtm" in MDO
 128     mov_metadata(tmpReg, method_data);
 129     lock();
 130     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 131   }
 132   bind(L_done);
 133 }
 134 
 135 // Update counters and perform abort ratio calculation
 136 // input:  abort_status_Reg
 137 // rtm_counters_Reg, flags are killed
 138 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 139                                       Register rtm_counters_Reg,
 140                                       RTMLockingCounters* rtm_counters,
 141                                       Metadata* method_data,
 142                                       bool profile_rtm) {
 143 
 144   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 145   // update rtm counters based on rax value at abort
 146   // reads abort_status_Reg, updates flags
 147   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 148   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 149   if (profile_rtm) {
 150     // Save abort status because abort_status_Reg is used by following code.
 151     if (RTMRetryCount > 0) {
 152       push(abort_status_Reg);
 153     }
 154     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 155     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 156     // restore abort status
 157     if (RTMRetryCount > 0) {
 158       pop(abort_status_Reg);
 159     }
 160   }
 161 }
 162 
 163 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 164 // inputs: retry_count_Reg
 165 //       : abort_status_Reg
 166 // output: retry_count_Reg decremented by 1
 167 // flags are killed
 168 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 169   Label doneRetry;
 170   assert(abort_status_Reg == rax, "");
 171   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 172   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 173   // if reason is in 0x6 and retry count != 0 then retry
 174   andptr(abort_status_Reg, 0x6);
 175   jccb(Assembler::zero, doneRetry);
 176   testl(retry_count_Reg, retry_count_Reg);
 177   jccb(Assembler::zero, doneRetry);
 178   pause();
 179   decrementl(retry_count_Reg);
 180   jmp(retryLabel);
 181   bind(doneRetry);
 182 }
 183 
 184 // Spin and retry if lock is busy,
 185 // inputs: box_Reg (monitor address)
 186 //       : retry_count_Reg
 187 // output: retry_count_Reg decremented by 1
 188 //       : clear z flag if retry count exceeded
 189 // tmp_Reg, scr_Reg, flags are killed
 190 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 191                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 192   Label SpinLoop, SpinExit, doneRetry;
 193   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 194 
 195   testl(retry_count_Reg, retry_count_Reg);
 196   jccb(Assembler::zero, doneRetry);
 197   decrementl(retry_count_Reg);
 198   movptr(scr_Reg, RTMSpinLoopCount);
 199 
 200   bind(SpinLoop);
 201   pause();
 202   decrementl(scr_Reg);
 203   jccb(Assembler::lessEqual, SpinExit);
 204   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 205   testptr(tmp_Reg, tmp_Reg);
 206   jccb(Assembler::notZero, SpinLoop);
 207 
 208   bind(SpinExit);
 209   jmp(retryLabel);
 210   bind(doneRetry);
 211   incrementl(retry_count_Reg); // clear z flag
 212 }
 213 
 214 // Use RTM for normal stack locks
 215 // Input: objReg (object to lock)
 216 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 217                                          Register retry_on_abort_count_Reg,
 218                                          RTMLockingCounters* stack_rtm_counters,
 219                                          Metadata* method_data, bool profile_rtm,
 220                                          Label& DONE_LABEL, Label& IsInflated) {
 221   assert(UseRTMForStackLocks, "why call this otherwise?");
 222   assert(tmpReg == rax, "");
 223   assert(scrReg == rdx, "");
 224   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 225 
 226   if (RTMRetryCount > 0) {
 227     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 228     bind(L_rtm_retry);
 229   }
 230   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 231   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 232   jcc(Assembler::notZero, IsInflated);
 233 
 234   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 235     Label L_noincrement;
 236     if (RTMTotalCountIncrRate > 1) {
 237       // tmpReg, scrReg and flags are killed
 238       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 239     }
 240     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 241     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 242     bind(L_noincrement);
 243   }
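  // Begin the transactional region; on abort, the hardware transfers control
  // to L_on_abort with the abort status in EAX (tmpReg).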
 244   xbegin(L_on_abort);
 245   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 246   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 247   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 248   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 249 
 250   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 251   if (UseRTMXendForLockBusy) {
 252     xend();
 253     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 254     jmp(L_decrement_retry);
 255   }
 256   else {
 257     xabort(0);
 258   }
 259   bind(L_on_abort);
 260   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 261     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 262   }
 263   bind(L_decrement_retry);
 264   if (RTMRetryCount > 0) {
 265     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 266     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 267   }
 268 }
 269 
// Use RTM for inflated locks
 271 // inputs: objReg (object to lock)
 272 //         boxReg (on-stack box address (displaced header location) - KILLED)
 273 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 274 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 275                                             Register scrReg, Register retry_on_busy_count_Reg,
 276                                             Register retry_on_abort_count_Reg,
 277                                             RTMLockingCounters* rtm_counters,
 278                                             Metadata* method_data, bool profile_rtm,
 279                                             Label& DONE_LABEL) {
 280   assert(UseRTMLocking, "why call this otherwise?");
 281   assert(tmpReg == rax, "");
 282   assert(scrReg == rdx, "");
 283   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 284   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 285 
 286   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 287   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 288   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 289 
 290   if (RTMRetryCount > 0) {
 291     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 292     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 293     bind(L_rtm_retry);
 294   }
 295   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 296     Label L_noincrement;
 297     if (RTMTotalCountIncrRate > 1) {
 298       // tmpReg, scrReg and flags are killed
 299       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 300     }
 301     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 302     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 303     bind(L_noincrement);
 304   }
 305   xbegin(L_on_abort);
 306   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 307   movptr(tmpReg, Address(tmpReg, owner_offset));
 308   testptr(tmpReg, tmpReg);
 309   jcc(Assembler::zero, DONE_LABEL);
 310   if (UseRTMXendForLockBusy) {
 311     xend();
 312     jmp(L_decrement_retry);
 313   }
 314   else {
 315     xabort(0);
 316   }
 317   bind(L_on_abort);
 318   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 319   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 320     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 321   }
 322   if (RTMRetryCount > 0) {
 323     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 324     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 325   }
 326 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 330 
 331   // Appears unlocked - try to swing _owner from null to non-null.
 332   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 333 #ifdef _LP64
 334   Register threadReg = r15_thread;
 335 #else
 336   get_thread(scrReg);
 337   Register threadReg = scrReg;
 338 #endif
 339   lock();
 340   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 341 
 342   if (RTMRetryCount > 0) {
 343     // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
 345     bind(L_decrement_retry);
 346     // Spin and retry if lock is busy.
 347     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 348   }
 349   else {
 350     bind(L_decrement_retry);
 351   }
 352 }
 353 
 354 #endif //  INCLUDE_RTM_OPT
 355 
 356 // fast_lock and fast_unlock used by C2
 357 
 358 // Because the transitions from emitted code to the runtime
 359 // monitorenter/exit helper stubs are so slow it's critical that
 360 // we inline both the stack-locking fast path and the inflated fast path.
 361 //
 362 // See also: cmpFastLock and cmpFastUnlock.
 363 //
 364 // What follows is a specialized inline transliteration of the code
 365 // in enter() and exit(). If we're concerned about I$ bloat another
 366 // option would be to emit TrySlowEnter and TrySlowExit methods
 367 // at startup-time.  These methods would accept arguments as
 368 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 369 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 370 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 371 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 376 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 378 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 379 // to those specialized methods.  That'd give us a mostly platform-independent
 380 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross into native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 385 //
 386 // TODO:
 387 //
 388 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 389 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 390 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 391 //    the lock operators would typically be faster than reifying Self.
 392 //
 393 // *  Ideally I'd define the primitives as:
 394 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 395 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 396 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 398 //    Furthermore the register assignments are overconstrained, possibly resulting in
 399 //    sub-optimal code near the synchronization site.
 400 //
 401 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 402 //    Alternately, use a better sp-proximity test.
 403 //
 404 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 405 //    Either one is sufficient to uniquely identify a thread.
 406 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 407 //
 408 // *  Intrinsify notify() and notifyAll() for the common cases where the
 409 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 411 //
 412 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 413 //    But beware of excessive branch density on AMD Opterons.
 414 //
 415 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 416 //    or failure of the fast path.  If the fast path fails then we pass
 417 //    control to the slow path, typically in C.  In fast_lock and
 418 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 419 //    will emit a conditional branch immediately after the node.
 420 //    So we have branches to branches and lots of ICC.ZF games.
 421 //    Instead, it might be better to have C2 pass a "FailureLabel"
 422 //    into fast_lock and fast_unlock.  In the case of success, control
 423 //    will drop through the node.  ICC.ZF is undefined at exit.
 424 //    In the case of failure, the node will branch directly to the
 425 //    FailureLabel
 426 
 427 
 428 // obj: object to lock
 429 // box: on-stack box address (displaced header location) - KILLED
 430 // rax,: tmp -- KILLED
 431 // scr: tmp -- KILLED
 432 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 433                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 434                                  RTMLockingCounters* rtm_counters,
 435                                  RTMLockingCounters* stack_rtm_counters,
 436                                  Metadata* method_data,
 437                                  bool use_rtm, bool profile_rtm) {
 438   // Ensure the register assignments are disjoint
 439   assert(tmpReg == rax, "");
 440 
 441   if (use_rtm) {
 442     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 443   } else {
 444     assert(cx2Reg == noreg, "");
 445     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 446   }
 447 
 448   // Possible cases that we'll encounter in fast_lock
 449   // ------------------------------------------------
 450   // * Inflated
 451   //    -- unlocked
 452   //    -- Locked
 453   //       = by self
 454   //       = by other
 455   // * neutral
 456   // * stack-locked
 457   //    -- by self
 458   //       = sp-proximity test hits
 459   //       = sp-proximity test generates false-negative
 460   //    -- by other
 461   //
 462 
 463   Label IsInflated, DONE_LABEL;
 464 
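  // If the object's class is value-based, clear ZF and branch to DONE_LABEL so
  // that the slow path is taken and the runtime can diagnose the attempt.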
 465   if (DiagnoseSyncOnValueBasedClasses != 0) {
 466     load_klass(tmpReg, objReg, cx1Reg);
 467     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 468     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 469     jcc(Assembler::notZero, DONE_LABEL);
 470   }
 471 
 472 #if INCLUDE_RTM_OPT
 473   if (UseRTMForStackLocks && use_rtm) {
 474     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 475     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 476                       stack_rtm_counters, method_data, profile_rtm,
 477                       DONE_LABEL, IsInflated);
 478   }
 479 #endif // INCLUDE_RTM_OPT
 480 
 481   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 482   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 483   jccb(Assembler::notZero, IsInflated);
 484 
 485   if (!UseHeavyMonitors) {
 486     // Attempt stack-locking ...
 487     orptr (tmpReg, markWord::unlocked_value);
 488     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 489     lock();
 490     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 491     jcc(Assembler::equal, DONE_LABEL);           // Success
 492 
 493     // Recursive locking.
 494     // The object is stack-locked: markword contains stack pointer to BasicLock.
 495     // Locked by current thread if difference with current SP is less than one page.
 496     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 498     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 499     movptr(Address(boxReg, 0), tmpReg);
 500   } else {
 501     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 502     testptr(objReg, objReg);
 503   }
 504   jmp(DONE_LABEL);
 505 
 506   bind(IsInflated);
 507   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 508 
 509 #if INCLUDE_RTM_OPT
 510   // Use the same RTM locking code in 32- and 64-bit VM.
 511   if (use_rtm) {
 512     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 513                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 514   } else {
 515 #endif // INCLUDE_RTM_OPT
 516 
 517 #ifndef _LP64
 518   // The object is inflated.
 519 
 520   // boxReg refers to the on-stack BasicLock in the current frame.
 521   // We'd like to write:
 522   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 524   // additional latency as we have another ST in the store buffer that must drain.
 525 
 526   // avoid ST-before-CAS
 527   // register juggle because we need tmpReg for cmpxchgptr below
 528   movptr(scrReg, boxReg);
 529   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 530 
 531   // Optimistic form: consider XORL tmpReg,tmpReg
 532   movptr(tmpReg, NULL_WORD);
 533 
 534   // Appears unlocked - try to swing _owner from null to non-null.
 535   // Ideally, I'd manifest "Self" with get_thread and then attempt
 536   // to CAS the register containing Self into m->Owner.
 537   // But we don't have enough registers, so instead we can either try to CAS
 538   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 539   // we later store "Self" into m->Owner.  Transiently storing a stack address
 540   // (rsp or the address of the box) into  m->owner is harmless.
 541   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 542   lock();
 543   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 544   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 545   // If we weren't able to swing _owner from NULL to the BasicLock
 546   // then take the slow path.
 547   jccb  (Assembler::notZero, DONE_LABEL);
 548   // update _owner from BasicLock to thread
 549   get_thread (scrReg);                    // beware: clobbers ICCs
 550   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 551   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 552 
 553   // If the CAS fails we can either retry or pass control to the slow path.
 554   // We use the latter tactic.
 555   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 556   // If the CAS was successful ...
 557   //   Self has acquired the lock
 558   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 559   // Intentional fall-through into DONE_LABEL ...
 560 #else // _LP64
 561   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 562   movq(scrReg, tmpReg);
 563   xorq(tmpReg, tmpReg);
 564   lock();
 565   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 566   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 567   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 568   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 569   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 570   jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)
 571 
 572   cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
 573   jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
 574   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 575   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 576 #endif // _LP64
 577 #if INCLUDE_RTM_OPT
 578   } // use_rtm()
 579 #endif
 580   // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
 582   // See the AMD and Intel software optimization manuals for the
 583   // most efficient "long" NOP encodings.
 584   // Unfortunately none of our alignment mechanisms suffice.
 585   bind(DONE_LABEL);
 586 
 587   // At DONE_LABEL the icc ZFlag is set as follows ...
 588   // fast_unlock uses the same protocol.
 589   // ZFlag == 1 -> Success
 590   // ZFlag == 0 -> Failure - force control through the slow path
 591 }
 592 
 593 // obj: object to unlock
 594 // box: box address (displaced header location), killed.  Must be EAX.
 595 // tmp: killed, cannot be obj nor box.
 596 //
 597 // Some commentary on balanced locking:
 598 //
 599 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 600 // Methods that don't have provably balanced locking are forced to run in the
 601 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 602 // The interpreter provides two properties:
 603 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 605 //      interpreter maintains an on-stack list of locks currently held by
 606 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 609 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 611 // B() doesn't have provably balanced locking so it runs in the interpreter.
 612 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 613 // is still locked by A().
 614 //
 615 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 616 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 617 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 618 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
 620 // could reasonably *avoid* checking owner in fast_unlock().
 621 // In the interest of performance we elide m->Owner==Self check in unlock.
 622 // A perfectly viable alternative is to elide the owner check except when
 623 // Xcheck:jni is enabled.
 624 
 625 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 626   assert(boxReg == rax, "");
 627   assert_different_registers(objReg, boxReg, tmpReg);
 628 
 629   Label DONE_LABEL, Stacked, CheckSucc;
 630 
 631 #if INCLUDE_RTM_OPT
 632   if (UseRTMForStackLocks && use_rtm) {
 633     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 634     Label L_regular_unlock;
 635     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 636     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 637     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 638     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 639     xend();                                                           // otherwise end...
 640     jmp(DONE_LABEL);                                                  // ... and we're done
 641     bind(L_regular_unlock);
 642   }
 643 #endif
 644 
 645   if (!UseHeavyMonitors) {
 646     cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 647     jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 648   }
 649   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 650   if (!UseHeavyMonitors) {
 651     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 652     jccb  (Assembler::zero, Stacked);
 653   }
 654 
 655   // It's inflated.
 656 #if INCLUDE_RTM_OPT
 657   if (use_rtm) {
 658     Label L_regular_inflated_unlock;
 659     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 660     movptr(boxReg, Address(tmpReg, owner_offset));
 661     testptr(boxReg, boxReg);
 662     jccb(Assembler::notZero, L_regular_inflated_unlock);
 663     xend();
 664     jmpb(DONE_LABEL);
 665     bind(L_regular_inflated_unlock);
 666   }
 667 #endif
 668 
 669   // Despite our balanced locking property we still check that m->_owner == Self
 670   // as java routines or native JNI code called by this thread might
 671   // have released the lock.
 672   // Refer to the comments in synchronizer.cpp for how we might encode extra
 673   // state in _succ so we can avoid fetching EntryList|cxq.
 674   //
 675   // If there's no contention try a 1-0 exit.  That is, exit without
 676   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 677   // we detect and recover from the race that the 1-0 exit admits.
 678   //
 679   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 680   // before it STs null into _owner, releasing the lock.  Updates
 681   // to data protected by the critical section must be visible before
 682   // we drop the lock (and thus before any other thread could acquire
 683   // the lock and observe the fields protected by the lock).
 684   // IA32's memory-model is SPO, so STs are ordered with respect to
 685   // each other and there's no need for an explicit barrier (fence).
 686   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 687 #ifndef _LP64
 688   get_thread (boxReg);
 689 
 690   // Note that we could employ various encoding schemes to reduce
 691   // the number of loads below (currently 4) to just 2 or 3.
 692   // Refer to the comments in synchronizer.cpp.
 693   // In practice the chain of fetches doesn't seem to impact performance, however.
 694   xorptr(boxReg, boxReg);
 695   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 696   jccb  (Assembler::notZero, DONE_LABEL);
 697   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 698   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 699   jccb  (Assembler::notZero, CheckSucc);
 700   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 701   jmpb  (DONE_LABEL);
 702 
 703   bind (Stacked);
 704   // It's not inflated and it's not recursively stack-locked.
 705   // It must be stack-locked.
 706   // Try to reset the header to displaced header.
 707   // The "box" value on the stack is stable, so we can reload
 708   // and be assured we observe the same value as above.
 709   movptr(tmpReg, Address(boxReg, 0));
 710   lock();
 711   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
 713 
 714   // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
 716   // See the AMD and Intel software optimization manuals for the
 717   // most efficient "long" NOP encodings.
 718   // Unfortunately none of our alignment mechanisms suffice.
 719   bind (CheckSucc);
 720 #else // _LP64
 721   // It's inflated
 722   Label LNotRecursive, LSuccess, LGoSlowPath;
 723 
 724   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 725   jccb(Assembler::equal, LNotRecursive);
 726 
 727   // Recursive inflated unlock
 728   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 729   jmpb(LSuccess);
 730 
 731   bind(LNotRecursive);
 732   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 733   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 734   jccb  (Assembler::notZero, CheckSucc);
 735   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 736   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 737   jmpb  (DONE_LABEL);
 738 
 739   // Try to avoid passing control into the slow_path ...
 740   bind  (CheckSucc);
 741 
 742   // The following optional optimization can be elided if necessary
 743   // Effectively: if (succ == null) goto slow path
 744   // The code reduces the window for a race, however,
 745   // and thus benefits performance.
 746   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 747   jccb  (Assembler::zero, LGoSlowPath);
 748 
 749   xorptr(boxReg, boxReg);
 750   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 751   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 752 
 753   // Memory barrier/fence
 754   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 755   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 756   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 757   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 758   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 759   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 760   lock(); addl(Address(rsp, 0), 0);
 761 
 762   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 763   jccb  (Assembler::notZero, LSuccess);
 764 
 765   // Rare inopportune interleaving - race.
 766   // The successor vanished in the small window above.
 767   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 768   // We need to ensure progress and succession.
 769   // Try to reacquire the lock.
 770   // If that fails then the new owner is responsible for succession and this
 771   // thread needs to take no further action and can exit via the fast path (success).
 772   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 776 
 777   // box is really RAX -- the following CMPXCHG depends on that binding
 778   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 779   lock();
 780   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 781   // There's no successor so we tried to regrab the lock.
 782   // If that didn't work, then another thread grabbed the
 783   // lock so we're done (and exit was a success).
 784   jccb  (Assembler::notEqual, LSuccess);
 785   // Intentional fall-through into slow path
 786 
 787   bind  (LGoSlowPath);
 788   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 789   jmpb  (DONE_LABEL);
 790 
 791   bind  (LSuccess);
 792   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 793   jmpb  (DONE_LABEL);
 794 
 795   if (!UseHeavyMonitors) {
 796     bind  (Stacked);
 797     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 798     lock();
 799     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 800   }
 801 #endif
 802   bind(DONE_LABEL);
 803 }
 804 
 805 //-------------------------------------------------------------------------------------------
// Generic instruction support for C2 code generation from .ad files
 807 
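// Vector absolute value / negation for doubles: Op_AbsVD clears the sign bits
// by ANDing with a sign mask, Op_NegVD flips them by XORing with a sign-flip mask.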
 808 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 809   if (dst != src) {
 810     movdqu(dst, src);
 811   }
 812   if (opcode == Op_AbsVD) {
 813     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 814   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 816     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 817   }
 818 }
 819 
 820 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 821   if (opcode == Op_AbsVD) {
 822     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 823   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 825     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 826   }
 827 }
 828 
 829 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 830   if (dst != src) {
 831     movdqu(dst, src);
 832   }
 833   if (opcode == Op_AbsVF) {
 834     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 835   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 837     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 838   }
 839 }
 840 
 841 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 842   if (opcode == Op_AbsVF) {
 843     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 844   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 846     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 847   }
 848 }
 849 
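// SSE min/max for packed integers. There is no packed min/max for 64-bit
// elements before AVX-512, so T_LONG is emulated with pcmpgtq + blendvpd,
// which requires the comparison mask to be in xmm0.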
 850 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 851   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 852   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 853 
 854   if (opcode == Op_MinV) {
 855     if (elem_bt == T_BYTE) {
 856       pminsb(dst, src);
 857     } else if (elem_bt == T_SHORT) {
 858       pminsw(dst, src);
 859     } else if (elem_bt == T_INT) {
 860       pminsd(dst, src);
 861     } else {
 862       assert(elem_bt == T_LONG, "required");
 863       assert(tmp == xmm0, "required");
 864       assert_different_registers(dst, src, tmp);
 865       movdqu(xmm0, dst);
 866       pcmpgtq(xmm0, src);
 867       blendvpd(dst, src);  // xmm0 as mask
 868     }
 869   } else { // opcode == Op_MaxV
 870     if (elem_bt == T_BYTE) {
 871       pmaxsb(dst, src);
 872     } else if (elem_bt == T_SHORT) {
 873       pmaxsw(dst, src);
 874     } else if (elem_bt == T_INT) {
 875       pmaxsd(dst, src);
 876     } else {
 877       assert(elem_bt == T_LONG, "required");
 878       assert(tmp == xmm0, "required");
 879       assert_different_registers(dst, src, tmp);
 880       movdqu(xmm0, src);
 881       pcmpgtq(xmm0, dst);
 882       blendvpd(dst, src);  // xmm0 as mask
 883     }
 884   }
 885 }
 886 
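// AVX vector min/max. 64-bit lanes use vpminsq/vpmaxsq when AVX-512 is
// available (AVX-512VL for sub-512-bit vectors), and otherwise fall back to
// a compare-and-blend sequence.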
 887 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 888                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 889                                  int vlen_enc) {
 890   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 891 
 892   if (opcode == Op_MinV) {
 893     if (elem_bt == T_BYTE) {
 894       vpminsb(dst, src1, src2, vlen_enc);
 895     } else if (elem_bt == T_SHORT) {
 896       vpminsw(dst, src1, src2, vlen_enc);
 897     } else if (elem_bt == T_INT) {
 898       vpminsd(dst, src1, src2, vlen_enc);
 899     } else {
 900       assert(elem_bt == T_LONG, "required");
 901       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 902         vpminsq(dst, src1, src2, vlen_enc);
 903       } else {
 904         assert_different_registers(dst, src1, src2);
 905         vpcmpgtq(dst, src1, src2, vlen_enc);
 906         vblendvpd(dst, src1, src2, dst, vlen_enc);
 907       }
 908     }
 909   } else { // opcode == Op_MaxV
 910     if (elem_bt == T_BYTE) {
 911       vpmaxsb(dst, src1, src2, vlen_enc);
 912     } else if (elem_bt == T_SHORT) {
 913       vpmaxsw(dst, src1, src2, vlen_enc);
 914     } else if (elem_bt == T_INT) {
 915       vpmaxsd(dst, src1, src2, vlen_enc);
 916     } else {
 917       assert(elem_bt == T_LONG, "required");
 918       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 919         vpmaxsq(dst, src1, src2, vlen_enc);
 920       } else {
 921         assert_different_registers(dst, src1, src2);
 922         vpcmpgtq(dst, src1, src2, vlen_enc);
 923         vblendvpd(dst, src2, src1, dst, vlen_enc);
 924       }
 925     }
 926   }
 927 }
 928 
 929 // Float/Double min max
 930 
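// IEEE-754-aware vector min/max. The blend steps order +/-0.0 correctly
// (min favors -0.0, max favors +0.0) and the final unordered compare
// propagates NaN inputs, matching the semantics of Java's Math.min/Math.max.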
 931 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 932                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 933                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 934                                    int vlen_enc) {
 935   assert(UseAVX > 0, "required");
 936   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 937          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 938   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 939   assert_different_registers(a, b, tmp, atmp, btmp);
 940 
 941   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 942   bool is_double_word = is_double_word_type(elem_bt);
 943 
 944   if (!is_double_word && is_min) {
 945     vblendvps(atmp, a, b, a, vlen_enc);
 946     vblendvps(btmp, b, a, a, vlen_enc);
 947     vminps(tmp, atmp, btmp, vlen_enc);
 948     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 949     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 950   } else if (!is_double_word && !is_min) {
 951     vblendvps(btmp, b, a, b, vlen_enc);
 952     vblendvps(atmp, a, b, b, vlen_enc);
 953     vmaxps(tmp, atmp, btmp, vlen_enc);
 954     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 955     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 956   } else if (is_double_word && is_min) {
 957     vblendvpd(atmp, a, b, a, vlen_enc);
 958     vblendvpd(btmp, b, a, a, vlen_enc);
 959     vminpd(tmp, atmp, btmp, vlen_enc);
 960     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 961     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 962   } else {
 963     assert(is_double_word && !is_min, "sanity");
 964     vblendvpd(btmp, b, a, b, vlen_enc);
 965     vblendvpd(atmp, a, b, b, vlen_enc);
 966     vmaxpd(tmp, atmp, btmp, vlen_enc);
 967     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 968     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 969   }
 970 }
 971 
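// AVX-512 variant of the float/double min/max above: the sign-based selection
// is done with opmask registers (evpmov*2m + masked blends), and NaN lanes are
// merged into the result by the final masked move.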
 972 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 973                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 974                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 975                                     int vlen_enc) {
 976   assert(UseAVX > 2, "required");
 977   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 978          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 979   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 980   assert_different_registers(dst, a, b, atmp, btmp);
 981 
 982   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 983   bool is_double_word = is_double_word_type(elem_bt);
 984   bool merge = true;
 985 
 986   if (!is_double_word && is_min) {
 987     evpmovd2m(ktmp, a, vlen_enc);
 988     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
 989     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
 990     vminps(dst, atmp, btmp, vlen_enc);
 991     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 992     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
 993   } else if (!is_double_word && !is_min) {
 994     evpmovd2m(ktmp, b, vlen_enc);
 995     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
 996     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
 997     vmaxps(dst, atmp, btmp, vlen_enc);
 998     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 999     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1000   } else if (is_double_word && is_min) {
1001     evpmovq2m(ktmp, a, vlen_enc);
1002     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1003     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1004     vminpd(dst, atmp, btmp, vlen_enc);
1005     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1006     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1007   } else {
1008     assert(is_double_word && !is_min, "sanity");
1009     evpmovq2m(ktmp, b, vlen_enc);
1010     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1011     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1012     vmaxpd(dst, atmp, btmp, vlen_enc);
1013     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1014     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1015   }
1016 }
1017 
1018 // Float/Double signum
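// dst holds the input and receives the result: +/-0.0 and NaN are returned
// unchanged, positive inputs produce 1.0, and negative inputs produce -1.0
// (1.0 with its sign bit flipped).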
1019 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1020                                   XMMRegister zero, XMMRegister one,
1021                                   Register scratch) {
1022   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1023 
1024   Label DONE_LABEL;
1025 
1026   if (opcode == Op_SignumF) {
1027     assert(UseSSE > 0, "required");
1028     ucomiss(dst, zero);
1029     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1030     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1031     movflt(dst, one);
1032     jcc(Assembler::above, DONE_LABEL);
1033     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1034   } else if (opcode == Op_SignumD) {
1035     assert(UseSSE > 1, "required");
1036     ucomisd(dst, zero);
1037     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1038     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1039     movdbl(dst, one);
1040     jcc(Assembler::above, DONE_LABEL);
1041     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1042   }
1043 
1044   bind(DONE_LABEL);
1045 }
1046 
1047 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1048   if (sign) {
1049     pmovsxbw(dst, src);
1050   } else {
1051     pmovzxbw(dst, src);
1052   }
1053 }
1054 
1055 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1056   if (sign) {
1057     vpmovsxbw(dst, src, vector_len);
1058   } else {
1059     vpmovzxbw(dst, src, vector_len);
1060   }
1061 }
1062 
1063 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1064   if (sign) {
1065     vpmovsxbd(dst, src, vector_len);
1066   } else {
1067     vpmovzxbd(dst, src, vector_len);
1068   }
1069 }
1070 
1071 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1072   if (sign) {
1073     vpmovsxwd(dst, src, vector_len);
1074   } else {
1075     vpmovzxwd(dst, src, vector_len);
1076   }
1077 }
1078 
1079 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1080                                      int shift, int vector_len) {
1081   if (opcode == Op_RotateLeftV) {
1082     if (etype == T_INT) {
1083       evprold(dst, src, shift, vector_len);
1084     } else {
1085       assert(etype == T_LONG, "expected type T_LONG");
1086       evprolq(dst, src, shift, vector_len);
1087     }
1088   } else {
1089     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1090     if (etype == T_INT) {
1091       evprord(dst, src, shift, vector_len);
1092     } else {
1093       assert(etype == T_LONG, "expected type T_LONG");
1094       evprorq(dst, src, shift, vector_len);
1095     }
1096   }
1097 }
1098 
1099 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1100                                      XMMRegister shift, int vector_len) {
1101   if (opcode == Op_RotateLeftV) {
1102     if (etype == T_INT) {
1103       evprolvd(dst, src, shift, vector_len);
1104     } else {
1105       assert(etype == T_LONG, "expected type T_LONG");
1106       evprolvq(dst, src, shift, vector_len);
1107     }
1108   } else {
1109     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1110     if (etype == T_INT) {
1111       evprorvd(dst, src, shift, vector_len);
1112     } else {
1113       assert(etype == T_LONG, "expected type T_LONG");
1114       evprorvq(dst, src, shift, vector_len);
1115     }
1116   }
1117 }
1118 
1119 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1120   if (opcode == Op_RShiftVI) {
1121     psrad(dst, shift);
1122   } else if (opcode == Op_LShiftVI) {
1123     pslld(dst, shift);
1124   } else {
1125     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1126     psrld(dst, shift);
1127   }
1128 }
1129 
1130 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1131   switch (opcode) {
1132     case Op_RShiftVI:  psrad(dst, shift); break;
1133     case Op_LShiftVI:  pslld(dst, shift); break;
1134     case Op_URShiftVI: psrld(dst, shift); break;
1135 
1136     default: assert(false, "%s", NodeClassNames[opcode]);
1137   }
1138 }
1139 
1140 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1141   if (opcode == Op_RShiftVI) {
1142     vpsrad(dst, nds, shift, vector_len);
1143   } else if (opcode == Op_LShiftVI) {
1144     vpslld(dst, nds, shift, vector_len);
1145   } else {
1146     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1147     vpsrld(dst, nds, shift, vector_len);
1148   }
1149 }
1150 
1151 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1152   switch (opcode) {
1153     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1154     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1155     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1156 
1157     default: assert(false, "%s", NodeClassNames[opcode]);
1158   }
1159 }
1160 
1161 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1162   switch (opcode) {
1163     case Op_RShiftVB:  // fall-through
1164     case Op_RShiftVS:  psraw(dst, shift); break;
1165 
1166     case Op_LShiftVB:  // fall-through
1167     case Op_LShiftVS:  psllw(dst, shift);   break;
1168 
1169     case Op_URShiftVS: // fall-through
1170     case Op_URShiftVB: psrlw(dst, shift);  break;
1171 
1172     default: assert(false, "%s", NodeClassNames[opcode]);
1173   }
1174 }
1175 
1176 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1177   switch (opcode) {
1178     case Op_RShiftVB:  // fall-through
1179     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1180 
1181     case Op_LShiftVB:  // fall-through
1182     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1183 
1184     case Op_URShiftVS: // fall-through
1185     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1186 
1187     default: assert(false, "%s", NodeClassNames[opcode]);
1188   }
1189 }
1190 
1191 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1192   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1194     case Op_LShiftVL:  psllq(dst, shift); break;
1195     case Op_URShiftVL: psrlq(dst, shift); break;
1196 
1197     default: assert(false, "%s", NodeClassNames[opcode]);
1198   }
1199 }
1200 
1201 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1202   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1204   } else if (opcode == Op_LShiftVL) {
1205     psllq(dst, shift);
1206   } else {
1207     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1208     psrlq(dst, shift);
1209   }
1210 }
1211 
1212 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1213   switch (opcode) {
1214     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1215     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1216     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1217 
1218     default: assert(false, "%s", NodeClassNames[opcode]);
1219   }
1220 }
1221 
1222 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1223   if (opcode == Op_RShiftVL) {
1224     evpsraq(dst, nds, shift, vector_len);
1225   } else if (opcode == Op_LShiftVL) {
1226     vpsllq(dst, nds, shift, vector_len);
1227   } else {
1228     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1229     vpsrlq(dst, nds, shift, vector_len);
1230   }
1231 }
1232 
1233 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1234   switch (opcode) {
1235     case Op_RShiftVB:  // fall-through
1236     case Op_RShiftVS:  // fall-through
1237     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1238 
1239     case Op_LShiftVB:  // fall-through
1240     case Op_LShiftVS:  // fall-through
1241     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1242 
1243     case Op_URShiftVB: // fall-through
1244     case Op_URShiftVS: // fall-through
1245     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1246 
1247     default: assert(false, "%s", NodeClassNames[opcode]);
1248   }
1249 }
1250 
1251 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1252   switch (opcode) {
1253     case Op_RShiftVB:  // fall-through
1254     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1255 
1256     case Op_LShiftVB:  // fall-through
1257     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1258 
1259     case Op_URShiftVB: // fall-through
1260     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1261 
1262     default: assert(false, "%s", NodeClassNames[opcode]);
1263   }
1264 }
1265 
1266 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1267   assert(UseAVX >= 2, "required");
1268   switch (opcode) {
1269     case Op_RShiftVL: {
1270       if (UseAVX > 2) {
1271         assert(tmp == xnoreg, "not used");
1272         if (!VM_Version::supports_avx512vl()) {
1273           vlen_enc = Assembler::AVX_512bit;
1274         }
1275         evpsravq(dst, src, shift, vlen_enc);
1276       } else {
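                 // No 64-bit arithmetic right shift below AVX-512 (no vpsraq), so
                 // emulate it with a logical shift plus a sign fix-up:
                 //   sra(x, n) == (srl(x, n) ^ m) - m,  where m = srl(sign_mask, n)
                 // and sign_mask has only bit 63 of each lane set.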
1277         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1278         vpsrlvq(dst, src, shift, vlen_enc);
1279         vpsrlvq(tmp, tmp, shift, vlen_enc);
1280         vpxor(dst, dst, tmp, vlen_enc);
1281         vpsubq(dst, dst, tmp, vlen_enc);
1282       }
1283       break;
1284     }
1285     case Op_LShiftVL: {
1286       assert(tmp == xnoreg, "not used");
1287       vpsllvq(dst, src, shift, vlen_enc);
1288       break;
1289     }
1290     case Op_URShiftVL: {
1291       assert(tmp == xnoreg, "not used");
1292       vpsrlvq(dst, src, shift, vlen_enc);
1293       break;
1294     }
1295     default: assert(false, "%s", NodeClassNames[opcode]);
1296   }
1297 }
1298 
1299 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
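     // The bytes are widened to dwords (signed for the arithmetic shift, unsigned
     // otherwise), the shift counts are zero-extended to dwords, a per-lane dword
     // shift is applied, the results are masked back to byte range and packed with
     // vpackusdw, leaving one shifted byte value per word in dst.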
1300 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1301   assert(opcode == Op_LShiftVB ||
1302          opcode == Op_RShiftVB ||
1303          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1304   bool sign = (opcode != Op_URShiftVB);
1305   assert(vector_len == 0, "required");
1306   vextendbd(sign, dst, src, 1);
1307   vpmovzxbd(vtmp, shift, 1);
1308   varshiftd(opcode, dst, dst, vtmp, 1);
1309   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1310   vextracti128_high(vtmp, dst);
1311   vpackusdw(dst, dst, vtmp, 0);
1312 }
1313 
1314 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
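     // The bytes are widened to words one vector size up (ext_vector_len), the shift
     // counts are zero-extended to words, a per-lane word shift is applied, the
     // results are masked back to byte range and packed back to bytes; the 512-bit
     // case needs an extra vpermq (0xD8) to restore lane order after vpackuswb.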
1315 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1316   assert(opcode == Op_LShiftVB ||
1317          opcode == Op_RShiftVB ||
1318          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1319   bool sign = (opcode != Op_URShiftVB);
1320   int ext_vector_len = vector_len + 1;
1321   vextendbw(sign, dst, src, ext_vector_len);
1322   vpmovzxbw(vtmp, shift, ext_vector_len);
1323   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1324   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1325   if (vector_len == 0) {
1326     vextracti128_high(vtmp, dst);
1327     vpackuswb(dst, dst, vtmp, vector_len);
1328   } else {
1329     vextracti64x4_high(vtmp, dst);
1330     vpackuswb(dst, dst, vtmp, vector_len);
1331     vpermq(dst, dst, 0xD8, vector_len);
1332   }
1333 }
1334 
1335 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1336   switch(typ) {
1337     case T_BYTE:
1338       pinsrb(dst, val, idx);
1339       break;
1340     case T_SHORT:
1341       pinsrw(dst, val, idx);
1342       break;
1343     case T_INT:
1344       pinsrd(dst, val, idx);
1345       break;
1346     case T_LONG:
1347       pinsrq(dst, val, idx);
1348       break;
1349     default:
1350       assert(false,"Should not reach here.");
1351       break;
1352   }
1353 }
1354 
1355 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1356   switch(typ) {
1357     case T_BYTE:
1358       vpinsrb(dst, src, val, idx);
1359       break;
1360     case T_SHORT:
1361       vpinsrw(dst, src, val, idx);
1362       break;
1363     case T_INT:
1364       vpinsrd(dst, src, val, idx);
1365       break;
1366     case T_LONG:
1367       vpinsrq(dst, src, val, idx);
1368       break;
1369     default:
1370       assert(false,"Should not reach here.");
1371       break;
1372   }
1373 }
1374 
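     // Gather/scatter helpers. idx holds 32-bit indices; the memory operand is
     // base + index * element size (times_4 for int/float, times_8 for long/double).
     // The AVX2 gather takes a vector mask, the AVX-512 forms take a kmask.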
1375 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1376   switch(typ) {
1377     case T_INT:
1378       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1379       break;
1380     case T_FLOAT:
1381       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1382       break;
1383     case T_LONG:
1384       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1385       break;
1386     case T_DOUBLE:
1387       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1388       break;
1389     default:
1390       assert(false,"Should not reach here.");
1391       break;
1392   }
1393 }
1394 
1395 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1396   switch(typ) {
1397     case T_INT:
1398       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1399       break;
1400     case T_FLOAT:
1401       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1402       break;
1403     case T_LONG:
1404       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1405       break;
1406     case T_DOUBLE:
1407       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1408       break;
1409     default:
1410       assert(false,"Should not reach here.");
1411       break;
1412   }
1413 }
1414 
1415 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1416   switch(typ) {
1417     case T_INT:
1418       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1419       break;
1420     case T_FLOAT:
1421       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1422       break;
1423     case T_LONG:
1424       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1425       break;
1426     case T_DOUBLE:
1427       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1428       break;
1429     default:
1430       assert(false,"Should not reach here.");
1431       break;
1432   }
1433 }
1434 
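     // Expand a vector of booleans (one byte per element, 0 or 1) into a vector
     // mask: negating the bytes (0 - src) turns 1 into 0xFF, and sign extension
     // (pmovsxbw/bd/bq) widens that to all-ones lanes of the element type. With
     // is_legacy the byte subtract is issued with a 256-bit encoding.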
1435 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1436   if (vlen_in_bytes <= 16) {
1437     pxor (dst, dst);
1438     psubb(dst, src);
1439     switch (elem_bt) {
1440       case T_BYTE:   /* nothing to do */ break;
1441       case T_SHORT:  pmovsxbw(dst, dst); break;
1442       case T_INT:    pmovsxbd(dst, dst); break;
1443       case T_FLOAT:  pmovsxbd(dst, dst); break;
1444       case T_LONG:   pmovsxbq(dst, dst); break;
1445       case T_DOUBLE: pmovsxbq(dst, dst); break;
1446 
1447       default: assert(false, "%s", type2name(elem_bt));
1448     }
1449   } else {
1450     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1451     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1452 
1453     vpxor (dst, dst, dst, vlen_enc);
1454     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1455 
1456     switch (elem_bt) {
1457       case T_BYTE:   /* nothing to do */            break;
1458       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1459       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1460       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1461       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1462       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1463 
1464       default: assert(false, "%s", type2name(elem_bt));
1465     }
1466   }
1467 }
1468 
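     // Produce an AVX-512 opmask from a vector of booleans. Without the needed
     // VL/BW/DQ support (novlbwdq) the bytes are sign-extended to dwords and
     // compared for equality against the vector_int_mask_cmp_bits constant;
     // otherwise the bytes are negated and vpmovb2m collects their sign bits.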
1469 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp,
1470                                          Register tmp, bool novlbwdq, int vlen_enc) {
1471   if (novlbwdq) {
1472     vpmovsxbd(xtmp, src, vlen_enc);
1473     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1474             Assembler::eq, true, vlen_enc, tmp);
1475   } else {
1476     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1477     vpsubb(xtmp, xtmp, src, vlen_enc);
1478     evpmovb2m(dst, xtmp, vlen_enc);
1479   }
1480 }
1481 
1482 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1483   switch (vlen_in_bytes) {
1484   case 4:  movdl(dst, src);   break;
1485   case 8:  movq(dst, src);    break;
1486   case 16: movdqu(dst, src);  break;
1487   case 32: vmovdqu(dst, src); break;
1488   case 64: evmovdquq(dst, src, Assembler::AVX_512bit); break;
1489   default: ShouldNotReachHere();
1490   }
1491 }
1492 
1493 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1494   if (reachable(src)) {
1495     load_vector(dst, as_Address(src), vlen_in_bytes);
1496   } else {
1497     lea(rscratch, src);
1498     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1499   }
1500 }
1501 
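     // Load the first vlen_in_bytes bytes of the iota index table from the stub
     // area, using the narrowest load that covers the requested length.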
1502 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1503   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1504   if (vlen_in_bytes == 4) {
1505     movdl(dst, addr);
1506   } else if (vlen_in_bytes == 8) {
1507     movq(dst, addr);
1508   } else if (vlen_in_bytes == 16) {
1509     movdqu(dst, addr, scratch);
1510   } else if (vlen_in_bytes == 32) {
1511     vmovdqu(dst, addr, scratch);
1512   } else {
1513     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1514     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1515   }
1516 }
1517 
1518 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
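     //
     // The general scheme: repeatedly fold the upper half of the vector into the
     // lower half (vextract*_high + reduce_operation_128/256) until a single element
     // remains, fold in the scalar accumulator src1 (integral reductions), and
     // sign-extend sub-int results into dst. Integer add reductions use horizontal
     // adds (phaddd/phaddw) instead of shuffles where available.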
1519 
1520 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1521   int vector_len = Assembler::AVX_128bit;
1522 
1523   switch (opcode) {
1524     case Op_AndReductionV:  pand(dst, src); break;
1525     case Op_OrReductionV:   por (dst, src); break;
1526     case Op_XorReductionV:  pxor(dst, src); break;
1527     case Op_MinReductionV:
1528       switch (typ) {
1529         case T_BYTE:        pminsb(dst, src); break;
1530         case T_SHORT:       pminsw(dst, src); break;
1531         case T_INT:         pminsd(dst, src); break;
1532         case T_LONG:        assert(UseAVX > 2, "required");
1533                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1534         default:            assert(false, "wrong type");
1535       }
1536       break;
1537     case Op_MaxReductionV:
1538       switch (typ) {
1539         case T_BYTE:        pmaxsb(dst, src); break;
1540         case T_SHORT:       pmaxsw(dst, src); break;
1541         case T_INT:         pmaxsd(dst, src); break;
1542         case T_LONG:        assert(UseAVX > 2, "required");
1543                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1544         default:            assert(false, "wrong type");
1545       }
1546       break;
1547     case Op_AddReductionVF: addss(dst, src); break;
1548     case Op_AddReductionVD: addsd(dst, src); break;
1549     case Op_AddReductionVI:
1550       switch (typ) {
1551         case T_BYTE:        paddb(dst, src); break;
1552         case T_SHORT:       paddw(dst, src); break;
1553         case T_INT:         paddd(dst, src); break;
1554         default:            assert(false, "wrong type");
1555       }
1556       break;
1557     case Op_AddReductionVL: paddq(dst, src); break;
1558     case Op_MulReductionVF: mulss(dst, src); break;
1559     case Op_MulReductionVD: mulsd(dst, src); break;
1560     case Op_MulReductionVI:
1561       switch (typ) {
1562         case T_SHORT:       pmullw(dst, src); break;
1563         case T_INT:         pmulld(dst, src); break;
1564         default:            assert(false, "wrong type");
1565       }
1566       break;
1567     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1568                             vpmullq(dst, dst, src, vector_len); break;
1569     default:                assert(false, "wrong opcode");
1570   }
1571 }
1572 
1573 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1574   int vector_len = Assembler::AVX_256bit;
1575 
1576   switch (opcode) {
1577     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1578     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1579     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1580     case Op_MinReductionV:
1581       switch (typ) {
1582         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1583         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1584         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1585         case T_LONG:        assert(UseAVX > 2, "required");
1586                             vpminsq(dst, src1, src2, vector_len); break;
1587         default:            assert(false, "wrong type");
1588       }
1589       break;
1590     case Op_MaxReductionV:
1591       switch (typ) {
1592         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1593         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1594         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1595         case T_LONG:        assert(UseAVX > 2, "required");
1596                             vpmaxsq(dst, src1, src2, vector_len); break;
1597         default:            assert(false, "wrong type");
1598       }
1599       break;
1600     case Op_AddReductionVI:
1601       switch (typ) {
1602         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1603         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1604         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1605         default:            assert(false, "wrong type");
1606       }
1607       break;
1608     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1609     case Op_MulReductionVI:
1610       switch (typ) {
1611         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1612         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1613         default:            assert(false, "wrong type");
1614       }
1615       break;
1616     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1617     default:                assert(false, "wrong opcode");
1618   }
1619 }
1620 
1621 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1622                                   XMMRegister dst, XMMRegister src,
1623                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1624   switch (opcode) {
1625     case Op_AddReductionVF:
1626     case Op_MulReductionVF:
1627       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1628       break;
1629 
1630     case Op_AddReductionVD:
1631     case Op_MulReductionVD:
1632       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1633       break;
1634 
1635     default: assert(false, "wrong opcode");
1636   }
1637 }
1638 
1639 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1640                              Register dst, Register src1, XMMRegister src2,
1641                              XMMRegister vtmp1, XMMRegister vtmp2) {
1642   switch (vlen) {
1643     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1644     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1645     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1646     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1647 
1648     default: assert(false, "wrong vector length");
1649   }
1650 }
1651 
1652 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1653                              Register dst, Register src1, XMMRegister src2,
1654                              XMMRegister vtmp1, XMMRegister vtmp2) {
1655   switch (vlen) {
1656     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1657     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1658     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1659     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1660 
1661     default: assert(false, "wrong vector length");
1662   }
1663 }
1664 
1665 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1666                              Register dst, Register src1, XMMRegister src2,
1667                              XMMRegister vtmp1, XMMRegister vtmp2) {
1668   switch (vlen) {
1669     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1670     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1671     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1672     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1673 
1674     default: assert(false, "wrong vector length");
1675   }
1676 }
1677 
1678 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1679                              Register dst, Register src1, XMMRegister src2,
1680                              XMMRegister vtmp1, XMMRegister vtmp2) {
1681   switch (vlen) {
1682     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1683     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1684     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1685     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1686 
1687     default: assert(false, "wrong vector length");
1688   }
1689 }
1690 
1691 #ifdef _LP64
1692 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1693                              Register dst, Register src1, XMMRegister src2,
1694                              XMMRegister vtmp1, XMMRegister vtmp2) {
1695   switch (vlen) {
1696     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1697     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1698     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1699 
1700     default: assert(false, "wrong vector length");
1701   }
1702 }
1703 #endif // _LP64
1704 
1705 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1706   switch (vlen) {
1707     case 2:
1708       assert(vtmp2 == xnoreg, "");
1709       reduce2F(opcode, dst, src, vtmp1);
1710       break;
1711     case 4:
1712       assert(vtmp2 == xnoreg, "");
1713       reduce4F(opcode, dst, src, vtmp1);
1714       break;
1715     case 8:
1716       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1717       break;
1718     case 16:
1719       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1720       break;
1721     default: assert(false, "wrong vector length");
1722   }
1723 }
1724 
1725 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1726   switch (vlen) {
1727     case 2:
1728       assert(vtmp2 == xnoreg, "");
1729       reduce2D(opcode, dst, src, vtmp1);
1730       break;
1731     case 4:
1732       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1733       break;
1734     case 8:
1735       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1736       break;
1737     default: assert(false, "wrong vector length");
1738   }
1739 }
1740 
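     // Two-element int reduction: Op_AddReductionVI uses phaddd, which leaves the
     // sum of the two dwords in the low element; other opcodes shuffle element 1
     // down (pshufd 0x1) and apply the op. The scalar accumulator src1 is then
     // folded in and the low dword moved to dst.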
1741 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1742   if (opcode == Op_AddReductionVI) {
1743     if (vtmp1 != src2) {
1744       movdqu(vtmp1, src2);
1745     }
1746     phaddd(vtmp1, vtmp1);
1747   } else {
1748     pshufd(vtmp1, src2, 0x1);
1749     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1750   }
1751   movdl(vtmp2, src1);
1752   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1753   movdl(dst, vtmp1);
1754 }
1755 
1756 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1757   if (opcode == Op_AddReductionVI) {
1758     if (vtmp1 != src2) {
1759       movdqu(vtmp1, src2);
1760     }
1761     phaddd(vtmp1, src2);
1762     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1763   } else {
1764     pshufd(vtmp2, src2, 0xE);
1765     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1766     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1767   }
1768 }
1769 
1770 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1771   if (opcode == Op_AddReductionVI) {
1772     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1773     vextracti128_high(vtmp2, vtmp1);
1774     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1775     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1776   } else {
1777     vextracti128_high(vtmp1, src2);
1778     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1779     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1780   }
1781 }
1782 
1783 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1784   vextracti64x4_high(vtmp2, src2);
1785   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1786   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1787 }
1788 
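     // Byte reductions fold the lane in halves: pshufd brings bytes 4..7 down, then
     // psrldq by 2 and by 1 finishes the fold. The surviving byte is sign-extended
     // to a dword, combined with the scalar accumulator src1, and the result is
     // sign-extended back into dst.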
1789 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1790   pshufd(vtmp2, src2, 0x1);
1791   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1792   movdqu(vtmp1, vtmp2);
1793   psrldq(vtmp1, 2);
1794   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1795   movdqu(vtmp2, vtmp1);
1796   psrldq(vtmp2, 1);
1797   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1798   movdl(vtmp2, src1);
1799   pmovsxbd(vtmp1, vtmp1);
1800   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1801   pextrb(dst, vtmp1, 0x0);
1802   movsbl(dst, dst);
1803 }
1804 
1805 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1806   pshufd(vtmp1, src2, 0xE);
1807   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1808   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1809 }
1810 
1811 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1812   vextracti128_high(vtmp2, src2);
1813   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1814   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1815 }
1816 
1817 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1818   vextracti64x4_high(vtmp1, src2);
1819   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1820   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1821 }
1822 
1823 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1824   pmovsxbw(vtmp2, src2);
1825   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1826 }
1827 
1828 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1829   if (UseAVX > 1) {
1830     int vector_len = Assembler::AVX_256bit;
1831     vpmovsxbw(vtmp1, src2, vector_len);
1832     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1833   } else {
1834     pmovsxbw(vtmp2, src2);
1835     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1836     pshufd(vtmp2, src2, 0x1);
1837     pmovsxbw(vtmp2, src2);
1838     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1839   }
1840 }
1841 
1842 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1843   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1844     int vector_len = Assembler::AVX_512bit;
1845     vpmovsxbw(vtmp1, src2, vector_len);
1846     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1847   } else {
1848     assert(UseAVX >= 2,"Should not reach here.");
1849     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1850     vextracti128_high(vtmp2, src2);
1851     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1852   }
1853 }
1854 
1855 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1856   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1857   vextracti64x4_high(vtmp2, src2);
1858   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1859 }
1860 
1861 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1862   if (opcode == Op_AddReductionVI) {
1863     if (vtmp1 != src2) {
1864       movdqu(vtmp1, src2);
1865     }
1866     phaddw(vtmp1, vtmp1);
1867     phaddw(vtmp1, vtmp1);
1868   } else {
1869     pshufd(vtmp2, src2, 0x1);
1870     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1871     movdqu(vtmp1, vtmp2);
1872     psrldq(vtmp1, 2);
1873     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1874   }
1875   movdl(vtmp2, src1);
1876   pmovsxwd(vtmp1, vtmp1);
1877   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1878   pextrw(dst, vtmp1, 0x0);
1879   movswl(dst, dst);
1880 }
1881 
1882 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1883   if (opcode == Op_AddReductionVI) {
1884     if (vtmp1 != src2) {
1885       movdqu(vtmp1, src2);
1886     }
1887     phaddw(vtmp1, src2);
1888   } else {
1889     pshufd(vtmp1, src2, 0xE);
1890     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1891   }
1892   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1893 }
1894 
1895 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1896   if (opcode == Op_AddReductionVI) {
1897     int vector_len = Assembler::AVX_256bit;
1898     vphaddw(vtmp2, src2, src2, vector_len);
1899     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1900   } else {
1901     vextracti128_high(vtmp2, src2);
1902     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1903   }
1904   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1905 }
1906 
1907 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1908   int vector_len = Assembler::AVX_256bit;
1909   vextracti64x4_high(vtmp1, src2);
1910   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1911   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1912 }
1913 
1914 #ifdef _LP64
1915 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1916   pshufd(vtmp2, src2, 0xE);
1917   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1918   movdq(vtmp1, src1);
1919   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1920   movdq(dst, vtmp1);
1921 }
1922 
1923 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1924   vextracti128_high(vtmp1, src2);
1925   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1926   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1927 }
1928 
1929 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1930   vextracti64x4_high(vtmp2, src2);
1931   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1932   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1933 }
1934 
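     // Build a kmask with the low 'len' bits set: start from all ones and let
     // bzhiq clear every bit at position >= len.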
1935 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1936   mov64(temp, -1L);
1937   bzhiq(temp, temp, len);
1938   kmovql(dst, temp);
1939 }
1940 #endif // _LP64
1941 
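     // Float/double reductions accumulate one element at a time with the scalar
     // addss/mulss forms (via reduce_operation_128), bringing each element into the
     // low slot with pshufd and processing extracted upper halves last, so the
     // original element order is preserved.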
1942 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1943   reduce_operation_128(T_FLOAT, opcode, dst, src);
1944   pshufd(vtmp, src, 0x1);
1945   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1946 }
1947 
1948 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1949   reduce2F(opcode, dst, src, vtmp);
1950   pshufd(vtmp, src, 0x2);
1951   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1952   pshufd(vtmp, src, 0x3);
1953   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1954 }
1955 
1956 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1957   reduce4F(opcode, dst, src, vtmp2);
1958   vextractf128_high(vtmp2, src);
1959   reduce4F(opcode, dst, vtmp2, vtmp1);
1960 }
1961 
1962 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1963   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1964   vextracti64x4_high(vtmp1, src);
1965   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1966 }
1967 
1968 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1969   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1970   pshufd(vtmp, src, 0xE);
1971   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1972 }
1973 
1974 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1975   reduce2D(opcode, dst, src, vtmp2);
1976   vextractf128_high(vtmp2, src);
1977   reduce2D(opcode, dst, vtmp2, vtmp1);
1978 }
1979 
1980 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1981   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1982   vextracti64x4_high(vtmp1, src);
1983   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1984 }
1985 
1986 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1987   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1988 }
1989 
1990 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1991   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1992 }
1993 
1994 
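     // Min/max float reduction: log2(vlen) folding steps. Each step brings the
     // partner elements into wtmp (vextracti64x4_high for i == 3, vextracti128_high
     // for i == 2, vpermilps with imm 1 or 14 within a lane) and combines them with
     // vminmax_fp. When is_dst_valid, the accumulator in dst is folded in with one
     // final vminmax_fp.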
1995 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1996                                           XMMRegister dst, XMMRegister src,
1997                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1998                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1999   int permconst[] = {1, 14};
2000   XMMRegister wsrc = src;
2001   XMMRegister wdst = xmm_0;
2002   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2003 
2004   int vlen_enc = Assembler::AVX_128bit;
2005   if (vlen == 16) {
2006     vlen_enc = Assembler::AVX_256bit;
2007   }
2008 
2009   for (int i = log2(vlen) - 1; i >=0; i--) {
2010     if (i == 0 && !is_dst_valid) {
2011       wdst = dst;
2012     }
2013     if (i == 3) {
2014       vextracti64x4_high(wtmp, wsrc);
2015     } else if (i == 2) {
2016       vextracti128_high(wtmp, wsrc);
2017     } else { // i = [0,1]
2018       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2019     }
2020     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2021     wsrc = wdst;
2022     vlen_enc = Assembler::AVX_128bit;
2023   }
2024   if (is_dst_valid) {
2025     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2026   }
2027 }
2028 
2029 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2030                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2031                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2032   XMMRegister wsrc = src;
2033   XMMRegister wdst = xmm_0;
2034   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2035   int vlen_enc = Assembler::AVX_128bit;
2036   if (vlen == 8) {
2037     vlen_enc = Assembler::AVX_256bit;
2038   }
2039   for (int i = log2(vlen) - 1; i >=0; i--) {
2040     if (i == 0 && !is_dst_valid) {
2041       wdst = dst;
2042     }
2043     if (i == 1) {
2044       vextracti128_high(wtmp, wsrc);
2045     } else if (i == 2) {
2046       vextracti64x4_high(wtmp, wsrc);
2047     } else {
2048       assert(i == 0, "%d", i);
2049       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2050     }
2051     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2052     wsrc = wdst;
2053     vlen_enc = Assembler::AVX_128bit;
2054   }
2055   if (is_dst_valid) {
2056     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2057   }
2058 }
2059 
2060 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2061   switch (bt) {
2062     case T_BYTE:  pextrb(dst, src, idx); break;
2063     case T_SHORT: pextrw(dst, src, idx); break;
2064     case T_INT:   pextrd(dst, src, idx); break;
2065     case T_LONG:  pextrq(dst, src, idx); break;
2066 
2067     default:
2068       assert(false,"Should not reach here.");
2069       break;
2070   }
2071 }
2072 
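     // Return the 128-bit lane of src holding 'elemindex': lane 0 is src itself,
     // lane 1 is extracted into dst with vextractf128, lanes 2..3 with
     // vextractf32x4 (AVX-512 only).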
2073 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2074   int esize =  type2aelembytes(typ);
2075   int elem_per_lane = 16/esize;
2076   int lane = elemindex / elem_per_lane;
2077   int eindex = elemindex % elem_per_lane;
2078 
2079   if (lane >= 2) {
2080     assert(UseAVX > 2, "required");
2081     vextractf32x4(dst, src, lane & 3);
2082     return dst;
2083   } else if (lane > 0) {
2084     assert(UseAVX > 0, "required");
2085     vextractf128(dst, src, lane);
2086     return dst;
2087   } else {
2088     return src;
2089   }
2090 }
2091 
2092 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2093   int esize =  type2aelembytes(typ);
2094   int elem_per_lane = 16/esize;
2095   int eindex = elemindex % elem_per_lane;
2096   assert(is_integral_type(typ),"required");
2097 
2098   if (eindex == 0) {
2099     if (typ == T_LONG) {
2100       movq(dst, src);
2101     } else {
2102       movdl(dst, src);
2103       if (typ == T_BYTE)
2104         movsbl(dst, dst);
2105       else if (typ == T_SHORT)
2106         movswl(dst, dst);
2107     }
2108   } else {
2109     extract(typ, dst, src, eindex);
2110   }
2111 }
2112 
2113 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2114   int esize =  type2aelembytes(typ);
2115   int elem_per_lane = 16/esize;
2116   int eindex = elemindex % elem_per_lane;
2117   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2118 
2119   if (eindex == 0) {
2120     movq(dst, src);
2121   } else {
2122     if (typ == T_FLOAT) {
2123       if (UseAVX == 0) {
2124         movdqu(dst, src);
2125         pshufps(dst, dst, eindex);
2126       } else {
2127         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2128       }
2129     } else {
2130       if (UseAVX == 0) {
2131         movdqu(dst, src);
2132         psrldq(dst, eindex*esize);
2133       } else {
2134         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2135       }
2136       movq(dst, dst);
2137     }
2138   }
2139   // Zero upper bits
2140   if (typ == T_FLOAT) {
2141     if (UseAVX == 0) {
2142       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2143       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2144       pand(dst, vtmp);
2145     } else {
2146       assert((tmp != noreg), "required.");
2147       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2148     }
2149   }
2150 }
2151 
2152 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2153   switch(typ) {
2154     case T_BYTE:
2155     case T_BOOLEAN:
2156       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2157       break;
2158     case T_SHORT:
2159     case T_CHAR:
2160       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2161       break;
2162     case T_INT:
2163     case T_FLOAT:
2164       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2165       break;
2166     case T_LONG:
2167     case T_DOUBLE:
2168       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2169       break;
2170     default:
2171       assert(false,"Should not reach here.");
2172       break;
2173   }
2174 }
2175 
2176 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2177   switch(typ) {
2178     case T_BOOLEAN:
2179     case T_BYTE:
2180       evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2181       break;
2182     case T_CHAR:
2183     case T_SHORT:
2184       evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2185       break;
2186     case T_INT:
2187     case T_FLOAT:
2188       evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2189       break;
2190     case T_LONG:
2191     case T_DOUBLE:
2192       evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2193       break;
2194     default:
2195       assert(false,"Should not reach here.");
2196       break;
2197   }
2198 }
2199 
2200 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2201   switch(typ) {
2202     case T_BYTE:
2203       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2204       break;
2205     case T_SHORT:
2206       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2207       break;
2208     case T_INT:
2209     case T_FLOAT:
2210       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2211       break;
2212     case T_LONG:
2213     case T_DOUBLE:
2214       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2215       break;
2216     default:
2217       assert(false,"Should not reach here.");
2218       break;
2219   }
2220 }
2221 
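     // Set the condition flags for a vector test. Vectors shorter than 16 bytes are
     // shuffled so the significant bits fill a full XMM register before ptest; for
     // BoolTest::overflow both operands are shuffled, otherwise src2 is used as-is.
     // 256-bit vectors use vptest; 512-bit vectors compare bytewise into 'mask' and
     // use ktestql (ne) or kortestql (overflow).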
2222 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2223                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2224   switch(vlen) {
2225     case 4:
2226       assert(vtmp1 != xnoreg, "required.");
2227       // Broadcast lower 32 bits to 128 bits before ptest
2228       pshufd(vtmp1, src1, 0x0);
2229       if (bt == BoolTest::overflow) {
2230         assert(vtmp2 != xnoreg, "required.");
2231         pshufd(vtmp2, src2, 0x0);
2232       } else {
2233         assert(vtmp2 == xnoreg, "required.");
2234         vtmp2 = src2;
2235       }
2236       ptest(vtmp1, vtmp2);
2237      break;
2238     case 8:
2239       assert(vtmp1 != xnoreg, "required.");
2240       // Broadcast lower 64 bits to 128 bits before ptest
2241       pshufd(vtmp1, src1, 0x4);
2242       if (bt == BoolTest::overflow) {
2243         assert(vtmp2 != xnoreg, "required.");
2244         pshufd(vtmp2, src2, 0x4);
2245       } else {
2246         assert(vtmp2 == xnoreg, "required.");
2247         vtmp2 = src2;
2248       }
2249       ptest(vtmp1, vtmp2);
2250      break;
2251     case 16:
2252       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2253       ptest(src1, src2);
2254       break;
2255     case 32:
2256       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2257       vptest(src1, src2, Assembler::AVX_256bit);
2258       break;
2259     case 64:
2260       {
2261         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2262         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2263         if (bt == BoolTest::ne) {
2264           ktestql(mask, mask);
2265         } else {
2266           assert(bt == BoolTest::overflow, "required");
2267           kortestql(mask, mask);
2268         }
2269       }
2270       break;
2271     default:
2272       assert(false,"Should not reach here.");
2273       break;
2274   }
2275 }
2276 
2277 //-------------------------------------------------------------------------------------------
2278 
2279 // IndexOf for constant substrings with size >= 8 chars
2280 // which don't need to be loaded through the stack.
2281 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2282                                          Register cnt1, Register cnt2,
2283                                          int int_cnt2,  Register result,
2284                                          XMMRegister vec, Register tmp,
2285                                          int ae) {
2286   ShortBranchVerifier sbv(this);
2287   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2288   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2289 
2290   // This method uses the pcmpestri instruction with bound registers
2291   //   inputs:
2292   //     xmm - substring
2293   //     rax - substring length (elements count)
2294   //     mem - scanned string
2295   //     rdx - string length (elements count)
2296   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2297   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2298   //   outputs:
2299   //     rcx - matched index in string
2300   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2301   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2302   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; // UU, UL -> 8
2303   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2304   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2305 
2306   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2307         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2308         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2309 
2310   // Note, inline_string_indexOf() generates checks:
2311   // if (substr.count > string.count) return -1;
2312   // if (substr.count == 0) return 0;
2313   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2314 
2315   // Load substring.
2316   if (ae == StrIntrinsicNode::UL) {
2317     pmovzxbw(vec, Address(str2, 0));
2318   } else {
2319     movdqu(vec, Address(str2, 0));
2320   }
2321   movl(cnt2, int_cnt2);
2322   movptr(result, str1); // string addr
2323 
2324   if (int_cnt2 > stride) {
2325     jmpb(SCAN_TO_SUBSTR);
2326 
2327     // Reload substr for rescan; this code
2328     // is executed only for large substrings (> 8 chars)
2329     bind(RELOAD_SUBSTR);
2330     if (ae == StrIntrinsicNode::UL) {
2331       pmovzxbw(vec, Address(str2, 0));
2332     } else {
2333       movdqu(vec, Address(str2, 0));
2334     }
2335     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2336 
2337     bind(RELOAD_STR);
2338     // We came here after the beginning of the substring was
2339     // matched but the rest of it was not, so we need to search
2340     // again. Start from the next element after the previous match.
2341 
2342     // cnt2 is the number of remaining substring elements and
2343     // cnt1 is the number of remaining string elements when the compare failed.
2344     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2345     subl(cnt1, cnt2);
2346     addl(cnt1, int_cnt2);
2347     movl(cnt2, int_cnt2); // Now restore cnt2
2348 
2349     decrementl(cnt1);     // Shift to next element
2350     cmpl(cnt1, cnt2);
2351     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2352 
2353     addptr(result, (1<<scale1));
2354 
2355   } // (int_cnt2 > 8)
2356 
2357   // Scan string for start of substr in 16-byte vectors
2358   bind(SCAN_TO_SUBSTR);
2359   pcmpestri(vec, Address(result, 0), mode);
2360   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2361   subl(cnt1, stride);
2362   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2363   cmpl(cnt1, cnt2);
2364   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2365   addptr(result, 16);
2366   jmpb(SCAN_TO_SUBSTR);
2367 
2368   // Found a potential substr
2369   bind(FOUND_CANDIDATE);
2370   // Matched whole vector if first element matched (tmp(rcx) == 0).
2371   if (int_cnt2 == stride) {
2372     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2373   } else { // int_cnt2 > 8
2374     jccb(Assembler::overflow, FOUND_SUBSTR);
2375   }
2376   // After pcmpestri tmp(rcx) contains matched element index
2377   // Compute start addr of substr
2378   lea(result, Address(result, tmp, scale1));
2379 
2380   // Make sure string is still long enough
2381   subl(cnt1, tmp);
2382   cmpl(cnt1, cnt2);
2383   if (int_cnt2 == stride) {
2384     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2385   } else { // int_cnt2 > 8
2386     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2387   }
2388   // Left less than substring.
2389 
2390   bind(RET_NOT_FOUND);
2391   movl(result, -1);
2392   jmp(EXIT);
2393 
2394   if (int_cnt2 > stride) {
2395     // This code is optimized for the case when whole substring
2396     // is matched if its head is matched.
2397     bind(MATCH_SUBSTR_HEAD);
2398     pcmpestri(vec, Address(result, 0), mode);
2399     // Reload only the string if it does not match
2400     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2401 
2402     Label CONT_SCAN_SUBSTR;
2403     // Compare the rest of substring (> 8 chars).
2404     bind(FOUND_SUBSTR);
2405     // First 8 chars are already matched.
2406     negptr(cnt2);
2407     addptr(cnt2, stride);
2408 
2409     bind(SCAN_SUBSTR);
2410     subl(cnt1, stride);
2411     cmpl(cnt2, -stride); // Do not read beyond substring
2412     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2413     // Back up strings to avoid reading beyond the substring:
2414     // cnt1 = cnt1 - cnt2 + 8
2415     addl(cnt1, cnt2); // cnt2 is negative
2416     addl(cnt1, stride);
2417     movl(cnt2, stride); negptr(cnt2);
2418     bind(CONT_SCAN_SUBSTR);
2419     if (int_cnt2 < (int)G) {
2420       int tail_off1 = int_cnt2<<scale1;
2421       int tail_off2 = int_cnt2<<scale2;
2422       if (ae == StrIntrinsicNode::UL) {
2423         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2424       } else {
2425         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2426       }
2427       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2428     } else {
2429       // calculate index in register to avoid integer overflow (int_cnt2*2)
2430       movl(tmp, int_cnt2);
2431       addptr(tmp, cnt2);
2432       if (ae == StrIntrinsicNode::UL) {
2433         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2434       } else {
2435         movdqu(vec, Address(str2, tmp, scale2, 0));
2436       }
2437       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2438     }
2439     // Need to reload string pointers if we did not match the whole vector
2440     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2441     addptr(cnt2, stride);
2442     jcc(Assembler::negative, SCAN_SUBSTR);
2443     // Fall through if found full substring
2444 
2445   } // (int_cnt2 > 8)
2446 
2447   bind(RET_FOUND);
2448   // Found result if we matched full small substring.
2449   // Compute substr offset
2450   subptr(result, str1);
2451   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2452     shrl(result, 1); // index
2453   }
2454   bind(EXIT);
2455 
2456 } // string_indexofC8
2457 
2458 // Small strings are loaded through the stack if they cross a page boundary.
2459 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2460                                        Register cnt1, Register cnt2,
2461                                        int int_cnt2,  Register result,
2462                                        XMMRegister vec, Register tmp,
2463                                        int ae) {
2464   ShortBranchVerifier sbv(this);
2465   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2466   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2467 
2468   //
2469   // int_cnt2 is length of small (< 8 chars) constant substring
2470   // or (-1) for non constant substring in which case its length
2471   // is in cnt2 register.
2472   //
2473   // Note, inline_string_indexOf() generates checks:
2474   // if (substr.count > string.count) return -1;
2475   // if (substr.count == 0) return 0;
2476   //
2477   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; // UU, UL -> 8
2478   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2479   // This method uses the pcmpestri instruction with bound registers
2480   //   inputs:
2481   //     xmm - substring
2482   //     rax - substring length (elements count)
2483   //     mem - scanned string
2484   //     rdx - string length (elements count)
2485   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2486   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2487   //   outputs:
2488   //     rcx - matched index in string
2489   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2490   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2491   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2492   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2493 
2494   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2495         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2496         FOUND_CANDIDATE;
2497 
2498   { //========================================================
2499     // We don't know where these strings are located
2500     // and we can't read beyond them. Load them through the stack.
2501     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2502 
2503     movptr(tmp, rsp); // save old SP
2504 
2505     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2506       if (int_cnt2 == (1>>scale2)) { // One byte
2507         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2508         load_unsigned_byte(result, Address(str2, 0));
2509         movdl(vec, result); // move 32 bits
2510       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2511         // Not enough header space in 32-bit VM: 12+3 = 15.
2512         movl(result, Address(str2, -1));
2513         shrl(result, 8);
2514         movdl(vec, result); // move 32 bits
2515       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2516         load_unsigned_short(result, Address(str2, 0));
2517         movdl(vec, result); // move 32 bits
2518       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2519         movdl(vec, Address(str2, 0)); // move 32 bits
2520       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2521         movq(vec, Address(str2, 0));  // move 64 bits
2522       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2523         // Array header size is 12 bytes in 32-bit VM
2524         // + 6 bytes for 3 chars == 18 bytes,
2525         // enough space to load vec and shift.
2526         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2527         if (ae == StrIntrinsicNode::UL) {
2528           int tail_off = int_cnt2-8;
2529           pmovzxbw(vec, Address(str2, tail_off));
2530           psrldq(vec, -2*tail_off);
2531         }
2532         else {
2533           int tail_off = int_cnt2*(1<<scale2);
2534           movdqu(vec, Address(str2, tail_off-16));
2535           psrldq(vec, 16-tail_off);
2536         }
2537       }
2538     } else { // not constant substring
2539       cmpl(cnt2, stride);
2540       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2541 
2542       // We can read beyond the string if str+16 does not cross a page boundary
2543       // since heaps are aligned and mapped by pages.
2544       assert(os::vm_page_size() < (int)G, "default page should be small");
2545       movl(result, str2); // We need only low 32 bits
2546       andl(result, (os::vm_page_size()-1));
2547       cmpl(result, (os::vm_page_size()-16));
2548       jccb(Assembler::belowEqual, CHECK_STR);
2549 
2550       // Move small strings to the stack to allow loading 16 bytes into vec.
2551       subptr(rsp, 16);
2552       int stk_offset = wordSize-(1<<scale2);
2553       push(cnt2);
2554 
2555       bind(COPY_SUBSTR);
2556       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2557         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2558         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2559       } else if (ae == StrIntrinsicNode::UU) {
2560         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2561         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2562       }
2563       decrement(cnt2);
2564       jccb(Assembler::notZero, COPY_SUBSTR);
2565 
2566       pop(cnt2);
2567       movptr(str2, rsp);  // New substring address
2568     } // non constant
2569 
2570     bind(CHECK_STR);
2571     cmpl(cnt1, stride);
2572     jccb(Assembler::aboveEqual, BIG_STRINGS);
2573 
2574     // Check cross page boundary.
2575     movl(result, str1); // We need only low 32 bits
2576     andl(result, (os::vm_page_size()-1));
2577     cmpl(result, (os::vm_page_size()-16));
2578     jccb(Assembler::belowEqual, BIG_STRINGS);
2579 
2580     subptr(rsp, 16);
2581     int stk_offset = -(1<<scale1);
2582     if (int_cnt2 < 0) { // not constant
2583       push(cnt2);
2584       stk_offset += wordSize;
2585     }
2586     movl(cnt2, cnt1);
2587 
2588     bind(COPY_STR);
2589     if (ae == StrIntrinsicNode::LL) {
2590       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2591       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2592     } else {
2593       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2594       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2595     }
2596     decrement(cnt2);
2597     jccb(Assembler::notZero, COPY_STR);
2598 
2599     if (int_cnt2 < 0) { // not constant
2600       pop(cnt2);
2601     }
2602     movptr(str1, rsp);  // New string address
2603 
2604     bind(BIG_STRINGS);
2605     // Load substring.
2606     if (int_cnt2 < 0) { // -1
2607       if (ae == StrIntrinsicNode::UL) {
2608         pmovzxbw(vec, Address(str2, 0));
2609       } else {
2610         movdqu(vec, Address(str2, 0));
2611       }
2612       push(cnt2);       // substr count
2613       push(str2);       // substr addr
2614       push(str1);       // string addr
2615     } else {
2616       // Small (< 8 chars) constant substrings are loaded already.
2617       movl(cnt2, int_cnt2);
2618     }
2619     push(tmp);  // original SP
2620 
2621   } // Finished loading
2622 
2623   //========================================================
2624   // Start search
2625   //
2626 
2627   movptr(result, str1); // string addr
2628 
2629   if (int_cnt2  < 0) {  // Only for non constant substring
2630     jmpb(SCAN_TO_SUBSTR);
2631 
2632     // SP saved at sp+0
2633     // String saved at sp+1*wordSize
2634     // Substr saved at sp+2*wordSize
2635     // Substr count saved at sp+3*wordSize
2636 
2637     // Reload substr for rescan; this code
2638     // is executed only for large substrings (> 8 chars).
2639     bind(RELOAD_SUBSTR);
2640     movptr(str2, Address(rsp, 2*wordSize));
2641     movl(cnt2, Address(rsp, 3*wordSize));
2642     if (ae == StrIntrinsicNode::UL) {
2643       pmovzxbw(vec, Address(str2, 0));
2644     } else {
2645       movdqu(vec, Address(str2, 0));
2646     }
2647     // We came here after the beginning of the substring was
2648     // matched but the rest of it was not, so we need to search
2649     // again. Start from the next element after the previous match.
2650     subptr(str1, result); // Restore counter
2651     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2652       shrl(str1, 1);
2653     }
2654     addl(cnt1, str1);
2655     decrementl(cnt1);   // Shift to next element
2656     cmpl(cnt1, cnt2);
2657     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2658 
2659     addptr(result, (1<<scale1));
2660   } // non constant
2661 
2662   // Scan string for start of substr in 16-byte vectors
2663   bind(SCAN_TO_SUBSTR);
2664   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2665   pcmpestri(vec, Address(result, 0), mode);
2666   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2667   subl(cnt1, stride);
2668   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2669   cmpl(cnt1, cnt2);
2670   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2671   addptr(result, 16);
2672 
2673   bind(ADJUST_STR);
2674   cmpl(cnt1, stride); // Do not read beyond string
2675   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2676   // Back up the string pointer to avoid reading beyond the string.
2677   lea(result, Address(result, cnt1, scale1, -16));
2678   movl(cnt1, stride);
2679   jmpb(SCAN_TO_SUBSTR);
2680 
2681   // Found a potential substr
2682   bind(FOUND_CANDIDATE);
2683   // After pcmpestri tmp(rcx) contains matched element index
2684 
2685   // Make sure string is still long enough
2686   subl(cnt1, tmp);
2687   cmpl(cnt1, cnt2);
2688   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2689   // Left less than substring.
2690 
2691   bind(RET_NOT_FOUND);
2692   movl(result, -1);
2693   jmp(CLEANUP);
2694 
2695   bind(FOUND_SUBSTR);
2696   // Compute start addr of substr
2697   lea(result, Address(result, tmp, scale1));
2698   if (int_cnt2 > 0) { // Constant substring
2699     // Repeat search for small substring (< 8 chars)
2700     // from new point without reloading substring.
2701     // Have to check that we don't read beyond string.
2702     cmpl(tmp, stride-int_cnt2);
2703     jccb(Assembler::greater, ADJUST_STR);
2704     // Fall through if matched whole substring.
2705   } else { // non constant
2706     assert(int_cnt2 == -1, "should be -1");
2707 
2708     addl(tmp, cnt2);
2709     // Found result if we matched whole substring.
2710     cmpl(tmp, stride);
2711     jcc(Assembler::lessEqual, RET_FOUND);
2712 
2713     // Repeat search for small substring (<= 8 chars)
2714     // from new point 'str1' without reloading substring.
2715     cmpl(cnt2, stride);
2716     // Have to check that we don't read beyond string.
2717     jccb(Assembler::lessEqual, ADJUST_STR);
2718 
2719     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2720     // Compare the rest of substring (> 8 chars).
2721     movptr(str1, result);
2722 
2723     cmpl(tmp, cnt2);
2724     // First 8 chars are already matched.
2725     jccb(Assembler::equal, CHECK_NEXT);
2726 
2727     bind(SCAN_SUBSTR);
2728     pcmpestri(vec, Address(str1, 0), mode);
2729     // Need to reload string pointers if we did not match the whole vector
2730     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2731 
2732     bind(CHECK_NEXT);
2733     subl(cnt2, stride);
2734     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2735     addptr(str1, 16);
2736     if (ae == StrIntrinsicNode::UL) {
2737       addptr(str2, 8);
2738     } else {
2739       addptr(str2, 16);
2740     }
2741     subl(cnt1, stride);
2742     cmpl(cnt2, stride); // Do not read beyond substring
2743     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2744     // Back up string pointers to avoid reading beyond the substring.
2745 
2746     if (ae == StrIntrinsicNode::UL) {
2747       lea(str2, Address(str2, cnt2, scale2, -8));
2748       lea(str1, Address(str1, cnt2, scale1, -16));
2749     } else {
2750       lea(str2, Address(str2, cnt2, scale2, -16));
2751       lea(str1, Address(str1, cnt2, scale1, -16));
2752     }
2753     subl(cnt1, cnt2);
2754     movl(cnt2, stride);
2755     addl(cnt1, stride);
2756     bind(CONT_SCAN_SUBSTR);
2757     if (ae == StrIntrinsicNode::UL) {
2758       pmovzxbw(vec, Address(str2, 0));
2759     } else {
2760       movdqu(vec, Address(str2, 0));
2761     }
2762     jmp(SCAN_SUBSTR);
2763 
2764     bind(RET_FOUND_LONG);
2765     movptr(str1, Address(rsp, wordSize));
2766   } // non constant
2767 
2768   bind(RET_FOUND);
2769   // Compute substr offset
2770   subptr(result, str1);
2771   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2772     shrl(result, 1); // index
2773   }
2774   bind(CLEANUP);
2775   pop(rsp); // restore SP
2776 
2777 } // string_indexof
2778 
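     // Finds the first occurrence of a single UTF-16 code unit (char) in a string.
     // Roughly equivalent scalar logic (illustrative sketch only, not the emitted
     // code; the method name is hypothetical):
     //   static int indexOfChar(char[] value, int ch, int max) {
     //     for (int i = 0; i < max; i++) {
     //       if (value[i] == ch) {
     //         return i;
     //       }
     //     }
     //     return -1;
     //   }
     // The vectorized code below broadcasts 'ch' and compares 16 (AVX2) or
     // 8 (SSE) chars per iteration, with a scalar loop for the tail.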
2779 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2780                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2781   ShortBranchVerifier sbv(this);
2782   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2783 
2784   int stride = 8;
2785 
2786   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2787         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2788         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2789         FOUND_SEQ_CHAR, DONE_LABEL;
2790 
2791   movptr(result, str1);
2792   if (UseAVX >= 2) {
2793     cmpl(cnt1, stride);
2794     jcc(Assembler::less, SCAN_TO_CHAR);
2795     cmpl(cnt1, 2*stride);
2796     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2797     movdl(vec1, ch);
2798     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2799     vpxor(vec2, vec2);
2800     movl(tmp, cnt1);
2801     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2802     andl(cnt1,0x0000000F);  //tail count (in chars)
2803 
2804     bind(SCAN_TO_16_CHAR_LOOP);
2805     vmovdqu(vec3, Address(result, 0));
2806     vpcmpeqw(vec3, vec3, vec1, 1);
2807     vptest(vec2, vec3);
2808     jcc(Assembler::carryClear, FOUND_CHAR);
2809     addptr(result, 32);
2810     subl(tmp, 2*stride);
2811     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2812     jmp(SCAN_TO_8_CHAR);
2813     bind(SCAN_TO_8_CHAR_INIT);
2814     movdl(vec1, ch);
2815     pshuflw(vec1, vec1, 0x00);
2816     pshufd(vec1, vec1, 0);
2817     pxor(vec2, vec2);
2818   }
2819   bind(SCAN_TO_8_CHAR);
2820   cmpl(cnt1, stride);
2821   jcc(Assembler::less, SCAN_TO_CHAR);
2822   if (UseAVX < 2) {
2823     movdl(vec1, ch);
2824     pshuflw(vec1, vec1, 0x00);
2825     pshufd(vec1, vec1, 0);
2826     pxor(vec2, vec2);
2827   }
2828   movl(tmp, cnt1);
2829   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2830   andl(cnt1,0x00000007);  //tail count (in chars)
2831 
2832   bind(SCAN_TO_8_CHAR_LOOP);
2833   movdqu(vec3, Address(result, 0));
2834   pcmpeqw(vec3, vec1);
2835   ptest(vec2, vec3);
2836   jcc(Assembler::carryClear, FOUND_CHAR);
2837   addptr(result, 16);
2838   subl(tmp, stride);
2839   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2840   bind(SCAN_TO_CHAR);
2841   testl(cnt1, cnt1);
2842   jcc(Assembler::zero, RET_NOT_FOUND);
2843   bind(SCAN_TO_CHAR_LOOP);
2844   load_unsigned_short(tmp, Address(result, 0));
2845   cmpl(ch, tmp);
2846   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2847   addptr(result, 2);
2848   subl(cnt1, 1);
2849   jccb(Assembler::zero, RET_NOT_FOUND);
2850   jmp(SCAN_TO_CHAR_LOOP);
2851 
2852   bind(RET_NOT_FOUND);
2853   movl(result, -1);
2854   jmpb(DONE_LABEL);
2855 
2856   bind(FOUND_CHAR);
2857   if (UseAVX >= 2) {
2858     vpmovmskb(tmp, vec3);
2859   } else {
2860     pmovmskb(tmp, vec3);
2861   }
2862   bsfl(ch, tmp);
2863   addptr(result, ch);
2864 
2865   bind(FOUND_SEQ_CHAR);
2866   subptr(result, str1);
2867   shrl(result, 1);
2868 
2869   bind(DONE_LABEL);
2870 } // string_indexof_char
2871 
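     // Latin-1 (byte) variant of the char search above. Roughly equivalent scalar
     // logic (illustrative sketch only, not the emitted code; method name
     // hypothetical):
     //   static int indexOfCharLatin1(byte[] value, int ch, int max) {
     //     for (int i = 0; i < max; i++) {
     //       if ((value[i] & 0xff) == ch) {
     //         return i;
     //       }
     //     }
     //     return -1;
     //   }
     // Here 32 (AVX2) or 16 (SSE) bytes are compared per iteration.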
2872 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2873                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2874   ShortBranchVerifier sbv(this);
2875   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2876 
2877   int stride = 16;
2878 
2879   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2880         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2881         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2882         FOUND_SEQ_CHAR, DONE_LABEL;
2883 
2884   movptr(result, str1);
2885   if (UseAVX >= 2) {
2886     cmpl(cnt1, stride);
2887     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2888     cmpl(cnt1, stride*2);
2889     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2890     movdl(vec1, ch);
2891     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2892     vpxor(vec2, vec2);
2893     movl(tmp, cnt1);
2894     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2895     andl(cnt1,0x0000001F);  //tail count (in chars)
2896 
2897     bind(SCAN_TO_32_CHAR_LOOP);
2898     vmovdqu(vec3, Address(result, 0));
2899     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2900     vptest(vec2, vec3);
2901     jcc(Assembler::carryClear, FOUND_CHAR);
2902     addptr(result, 32);
2903     subl(tmp, stride*2);
2904     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2905     jmp(SCAN_TO_16_CHAR);
2906 
2907     bind(SCAN_TO_16_CHAR_INIT);
2908     movdl(vec1, ch);
2909     pxor(vec2, vec2);
2910     pshufb(vec1, vec2);
2911   }
2912 
2913   bind(SCAN_TO_16_CHAR);
2914   cmpl(cnt1, stride);
2915   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2916   if (UseAVX < 2) {
2917     movdl(vec1, ch);
2918     pxor(vec2, vec2);
2919     pshufb(vec1, vec2);
2920   }
2921   movl(tmp, cnt1);
2922   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
2923   andl(cnt1,0x0000000F);  //tail count (in bytes)
2924 
2925   bind(SCAN_TO_16_CHAR_LOOP);
2926   movdqu(vec3, Address(result, 0));
2927   pcmpeqb(vec3, vec1);
2928   ptest(vec2, vec3);
2929   jcc(Assembler::carryClear, FOUND_CHAR);
2930   addptr(result, 16);
2931   subl(tmp, stride);
2932   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
2933 
2934   bind(SCAN_TO_CHAR_INIT);
2935   testl(cnt1, cnt1);
2936   jcc(Assembler::zero, RET_NOT_FOUND);
2937   bind(SCAN_TO_CHAR_LOOP);
2938   load_unsigned_byte(tmp, Address(result, 0));
2939   cmpl(ch, tmp);
2940   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2941   addptr(result, 1);
2942   subl(cnt1, 1);
2943   jccb(Assembler::zero, RET_NOT_FOUND);
2944   jmp(SCAN_TO_CHAR_LOOP);
2945 
2946   bind(RET_NOT_FOUND);
2947   movl(result, -1);
2948   jmpb(DONE_LABEL);
2949 
2950   bind(FOUND_CHAR);
2951   if (UseAVX >= 2) {
2952     vpmovmskb(tmp, vec3);
2953   } else {
2954     pmovmskb(tmp, vec3);
2955   }
2956   bsfl(ch, tmp);
2957   addptr(result, ch);
2958 
2959   bind(FOUND_SEQ_CHAR);
2960   subptr(result, str1);
2961 
2962   bind(DONE_LABEL);
2963 } // stringL_indexof_char
2964 
2965 // helper function for string_compare
2966 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
2967                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
2968                                            Address::ScaleFactor scale2, Register index, int ae) {
2969   if (ae == StrIntrinsicNode::LL) {
2970     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
2971     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
2972   } else if (ae == StrIntrinsicNode::UU) {
2973     load_unsigned_short(elem1, Address(str1, index, scale, 0));
2974     load_unsigned_short(elem2, Address(str2, index, scale, 0));
2975   } else {
2976     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
2977     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
2978   }
2979 }
2980 
2981 // Compare strings, used for char[] and byte[].
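     // The result is the difference of the first pair of mismatching elements,
     // or the difference of the lengths if one string is a prefix of the other.
     // Roughly equivalent scalar logic (illustrative sketch only, not the emitted
     // code; lengths are element counts):
     //   static int compare(char[] v1, int len1, char[] v2, int len2) {
     //     int lim = Math.min(len1, len2);
     //     for (int k = 0; k < lim; k++) {
     //       if (v1[k] != v2[k]) {
     //         return v1[k] - v2[k];
     //       }
     //     }
     //     return len1 - len2;
     //   }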
2982 void C2_MacroAssembler::string_compare(Register str1, Register str2,
2983                                        Register cnt1, Register cnt2, Register result,
2984                                        XMMRegister vec1, int ae, KRegister mask) {
2985   ShortBranchVerifier sbv(this);
2986   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
2987   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
2988   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
2989   int stride2x2 = 0x40;
2990   Address::ScaleFactor scale = Address::no_scale;
2991   Address::ScaleFactor scale1 = Address::no_scale;
2992   Address::ScaleFactor scale2 = Address::no_scale;
2993 
2994   if (ae != StrIntrinsicNode::LL) {
2995     stride2x2 = 0x20;
2996   }
2997 
2998   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
2999     shrl(cnt2, 1);
3000   }
3001   // Compute the minimum of the string lengths and push the
3002   // difference of the string lengths onto the stack.
3003   // The minimum is computed with a conditional move.
3004   movl(result, cnt1);
3005   subl(cnt1, cnt2);
3006   push(cnt1);
3007   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3008 
3009   // Is the minimum length zero?
3010   testl(cnt2, cnt2);
3011   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3012   if (ae == StrIntrinsicNode::LL) {
3013     // Load first bytes
3014     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3015     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3016   } else if (ae == StrIntrinsicNode::UU) {
3017     // Load first characters
3018     load_unsigned_short(result, Address(str1, 0));
3019     load_unsigned_short(cnt1, Address(str2, 0));
3020   } else {
3021     load_unsigned_byte(result, Address(str1, 0));
3022     load_unsigned_short(cnt1, Address(str2, 0));
3023   }
3024   subl(result, cnt1);
3025   jcc(Assembler::notZero,  POP_LABEL);
3026 
3027   if (ae == StrIntrinsicNode::UU) {
3028     // Divide length by 2 to get number of chars
3029     shrl(cnt2, 1);
3030   }
3031   cmpl(cnt2, 1);
3032   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3033 
3034   // Check if the strings start at the same location and setup scale and stride
3035   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3036     cmpptr(str1, str2);
3037     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3038     if (ae == StrIntrinsicNode::LL) {
3039       scale = Address::times_1;
3040       stride = 16;
3041     } else {
3042       scale = Address::times_2;
3043       stride = 8;
3044     }
3045   } else {
3046     scale1 = Address::times_1;
3047     scale2 = Address::times_2;
3048     // scale not used
3049     stride = 8;
3050   }
3051 
3052   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3053     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3054     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3055     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3056     Label COMPARE_TAIL_LONG;
3057     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3058 
3059     int pcmpmask = 0x19;
3060     if (ae == StrIntrinsicNode::LL) {
3061       pcmpmask &= ~0x01;
3062     }
3063 
3064     // Set up to compare 16-char (32-byte) vectors;
3065     // start from the first character again because it has an aligned address.
3066     if (ae == StrIntrinsicNode::LL) {
3067       stride2 = 32;
3068     } else {
3069       stride2 = 16;
3070     }
3071     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3072       adr_stride = stride << scale;
3073     } else {
3074       adr_stride1 = 8;  //stride << scale1;
3075       adr_stride2 = 16; //stride << scale2;
3076     }
3077 
3078     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3079     // rax and rdx are used by pcmpestri as element counters
3080     movl(result, cnt2);
3081     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3082     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3083 
3084     // fast path : compare first 2 8-char vectors.
3085     bind(COMPARE_16_CHARS);
3086     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3087       movdqu(vec1, Address(str1, 0));
3088     } else {
3089       pmovzxbw(vec1, Address(str1, 0));
3090     }
3091     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3092     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3093 
3094     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3095       movdqu(vec1, Address(str1, adr_stride));
3096       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3097     } else {
3098       pmovzxbw(vec1, Address(str1, adr_stride1));
3099       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3100     }
3101     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3102     addl(cnt1, stride);
3103 
3104     // Compare the characters at index in cnt1
3105     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3106     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3107     subl(result, cnt2);
3108     jmp(POP_LABEL);
3109 
3110     // Setup the registers to start vector comparison loop
3111     bind(COMPARE_WIDE_VECTORS);
3112     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3113       lea(str1, Address(str1, result, scale));
3114       lea(str2, Address(str2, result, scale));
3115     } else {
3116       lea(str1, Address(str1, result, scale1));
3117       lea(str2, Address(str2, result, scale2));
3118     }
3119     subl(result, stride2);
3120     subl(cnt2, stride2);
3121     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3122     negptr(result);
3123 
3124     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3125     bind(COMPARE_WIDE_VECTORS_LOOP);
3126 
3127 #ifdef _LP64
3128     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3129       cmpl(cnt2, stride2x2);
3130       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3131       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3132       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3133 
3134       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3135       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3136         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3137         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3138       } else {
3139         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3140         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3141       }
3142       kortestql(mask, mask);
3143       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3144       addptr(result, stride2x2);  // update since we already compared at this addr
3145       subl(cnt2, stride2x2);      // and sub the size too
3146       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3147 
3148       vpxor(vec1, vec1);
3149       jmpb(COMPARE_WIDE_TAIL);
3150     }//if (VM_Version::supports_avx512vlbw())
3151 #endif // _LP64
3152 
3153 
3154     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3155     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3156       vmovdqu(vec1, Address(str1, result, scale));
3157       vpxor(vec1, Address(str2, result, scale));
3158     } else {
3159       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3160       vpxor(vec1, Address(str2, result, scale2));
3161     }
3162     vptest(vec1, vec1);
3163     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3164     addptr(result, stride2);
3165     subl(cnt2, stride2);
3166     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3167     // clean upper bits of YMM registers
3168     vpxor(vec1, vec1);
3169 
3170     // compare wide vectors tail
3171     bind(COMPARE_WIDE_TAIL);
3172     testptr(result, result);
3173     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3174 
3175     movl(result, stride2);
3176     movl(cnt2, result);
3177     negptr(result);
3178     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3179 
3180     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3181     bind(VECTOR_NOT_EQUAL);
3182     // clean upper bits of YMM registers
3183     vpxor(vec1, vec1);
3184     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3185       lea(str1, Address(str1, result, scale));
3186       lea(str2, Address(str2, result, scale));
3187     } else {
3188       lea(str1, Address(str1, result, scale1));
3189       lea(str2, Address(str2, result, scale2));
3190     }
3191     jmp(COMPARE_16_CHARS);
3192 
3193     // Compare tail chars, length between 1 and 15 chars
3194     bind(COMPARE_TAIL_LONG);
3195     movl(cnt2, result);
3196     cmpl(cnt2, stride);
3197     jcc(Assembler::less, COMPARE_SMALL_STR);
3198 
3199     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3200       movdqu(vec1, Address(str1, 0));
3201     } else {
3202       pmovzxbw(vec1, Address(str1, 0));
3203     }
3204     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3205     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3206     subptr(cnt2, stride);
3207     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3208     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3209       lea(str1, Address(str1, result, scale));
3210       lea(str2, Address(str2, result, scale));
3211     } else {
3212       lea(str1, Address(str1, result, scale1));
3213       lea(str2, Address(str2, result, scale2));
3214     }
3215     negptr(cnt2);
3216     jmpb(WHILE_HEAD_LABEL);
3217 
3218     bind(COMPARE_SMALL_STR);
3219   } else if (UseSSE42Intrinsics) {
3220     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3221     int pcmpmask = 0x19;
3222     // Set up to compare 8-char (16-byte) vectors;
3223     // start from the first character again because it has an aligned address.
3224     movl(result, cnt2);
3225     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3226     if (ae == StrIntrinsicNode::LL) {
3227       pcmpmask &= ~0x01;
3228     }
3229     jcc(Assembler::zero, COMPARE_TAIL);
3230     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3231       lea(str1, Address(str1, result, scale));
3232       lea(str2, Address(str2, result, scale));
3233     } else {
3234       lea(str1, Address(str1, result, scale1));
3235       lea(str2, Address(str2, result, scale2));
3236     }
3237     negptr(result);
3238 
3239     // pcmpestri
3240     //   inputs:
3241     //     vec1- substring
3242     //     rax - negative string length (elements count)
3243     //     mem - scanned string
3244     //     rdx - string length (elements count)
3245     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3246     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3247     //   outputs:
3248     //     rcx - first mismatched element index
3249     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3250 
3251     bind(COMPARE_WIDE_VECTORS);
3252     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3253       movdqu(vec1, Address(str1, result, scale));
3254       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3255     } else {
3256       pmovzxbw(vec1, Address(str1, result, scale1));
3257       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3258     }
3259     // After pcmpestri cnt1(rcx) contains mismatched element index
3260 
3261     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3262     addptr(result, stride);
3263     subptr(cnt2, stride);
3264     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3265 
3266     // compare wide vectors tail
3267     testptr(result, result);
3268     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3269 
3270     movl(cnt2, stride);
3271     movl(result, stride);
3272     negptr(result);
3273     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3274       movdqu(vec1, Address(str1, result, scale));
3275       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3276     } else {
3277       pmovzxbw(vec1, Address(str1, result, scale1));
3278       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3279     }
3280     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3281 
3282     // Mismatched characters in the vectors
3283     bind(VECTOR_NOT_EQUAL);
3284     addptr(cnt1, result);
3285     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3286     subl(result, cnt2);
3287     jmpb(POP_LABEL);
3288 
3289     bind(COMPARE_TAIL); // limit is zero
3290     movl(cnt2, result);
3291     // Fallthru to tail compare
3292   }
3293   // Shift str2 and str1 to the end of the arrays, negate min
3294   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3295     lea(str1, Address(str1, cnt2, scale));
3296     lea(str2, Address(str2, cnt2, scale));
3297   } else {
3298     lea(str1, Address(str1, cnt2, scale1));
3299     lea(str2, Address(str2, cnt2, scale2));
3300   }
3301   decrementl(cnt2);  // first character was compared already
3302   negptr(cnt2);
3303 
3304   // Compare the rest of the elements
3305   bind(WHILE_HEAD_LABEL);
3306   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3307   subl(result, cnt1);
3308   jccb(Assembler::notZero, POP_LABEL);
3309   increment(cnt2);
3310   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3311 
3312   // Strings are equal up to min length.  Return the length difference.
3313   bind(LENGTH_DIFF_LABEL);
3314   pop(result);
3315   if (ae == StrIntrinsicNode::UU) {
3316     // Divide diff by 2 to get number of chars
3317     sarl(result, 1);
3318   }
3319   jmpb(DONE_LABEL);
3320 
3321 #ifdef _LP64
3322   if (VM_Version::supports_avx512vlbw()) {
3323 
3324     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3325 
3326     kmovql(cnt1, mask);
3327     notq(cnt1);
3328     bsfq(cnt2, cnt1);
3329     if (ae != StrIntrinsicNode::LL) {
3330       // Divide diff by 2 to get number of chars
3331       sarl(cnt2, 1);
3332     }
3333     addq(result, cnt2);
3334     if (ae == StrIntrinsicNode::LL) {
3335       load_unsigned_byte(cnt1, Address(str2, result));
3336       load_unsigned_byte(result, Address(str1, result));
3337     } else if (ae == StrIntrinsicNode::UU) {
3338       load_unsigned_short(cnt1, Address(str2, result, scale));
3339       load_unsigned_short(result, Address(str1, result, scale));
3340     } else {
3341       load_unsigned_short(cnt1, Address(str2, result, scale2));
3342       load_unsigned_byte(result, Address(str1, result, scale1));
3343     }
3344     subl(result, cnt1);
3345     jmpb(POP_LABEL);
3346   }//if (VM_Version::supports_avx512vlbw())
3347 #endif // _LP64
3348 
3349   // Discard the stored length difference
3350   bind(POP_LABEL);
3351   pop(cnt1);
3352 
3353   // That's it
3354   bind(DONE_LABEL);
3355   if (ae == StrIntrinsicNode::UL) {
3356     negl(result);
3357   }
3358 
3359 }
3360 
3361 // Search for Non-ASCII character (Negative byte value) in a byte array,
3362 // return the index of the first such character, otherwise the length
3363 // of the array segment searched.
3364 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3365 //   @IntrinsicCandidate
3366 //   public static int countPositives(byte[] ba, int off, int len) {
3367 //     for (int i = off; i < off + len; i++) {
3368 //       if (ba[i] < 0) {
3369 //         return i - off;
3370 //       }
3371 //     }
3372 //     return len;
3373 //   }
3374 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3375   Register result, Register tmp1,
3376   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3377   // rsi: byte array
3378   // rcx: len
3379   // rax: result
3380   ShortBranchVerifier sbv(this);
3381   assert_different_registers(ary1, len, result, tmp1);
3382   assert_different_registers(vec1, vec2);
3383   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3384 
3385   movl(result, len); // copy
3386   // len == 0
3387   testl(len, len);
3388   jcc(Assembler::zero, DONE);
3389 
3390   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3391     VM_Version::supports_avx512vlbw() &&
3392     VM_Version::supports_bmi2()) {
3393 
3394     Label test_64_loop, test_tail, BREAK_LOOP;
3395     Register tmp3_aliased = len;
3396 
3397     movl(tmp1, len);
3398     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3399 
3400     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3401     andl(len, ~(64 - 1));    // vector count (in chars)
3402     jccb(Assembler::zero, test_tail);
3403 
3404     lea(ary1, Address(ary1, len, Address::times_1));
3405     negptr(len);
3406 
3407     bind(test_64_loop);
3408     // Check whether our 64 elements of size byte contain negatives
3409     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3410     kortestql(mask1, mask1);
3411     jcc(Assembler::notZero, BREAK_LOOP);
3412 
3413     addptr(len, 64);
3414     jccb(Assembler::notZero, test_64_loop);
3415 
3416     bind(test_tail);
3417     // bail out when there is nothing to be done
3418     testl(tmp1, -1);
3419     jcc(Assembler::zero, DONE);
3420 
3421     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3422 #ifdef _LP64
3423     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3424     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3425     notq(tmp3_aliased);
3426     kmovql(mask2, tmp3_aliased);
3427 #else
3428     Label k_init;
3429     jmp(k_init);
3430 
3431     // We cannot read 64 bits from a general purpose register, so we move the
3432     // data required to compose 64 ones into the instruction stream.
3433     // We emit a 64-byte-wide series of elements from 0..63 which is later
3434     // used as a compare target against the tail count contained in the tmp1
3435     // register. The result is a k register holding tmp1 consecutive 1s,
3436     // counting from the least significant bit.
3437     address tmp = pc();
3438     emit_int64(0x0706050403020100);
3439     emit_int64(0x0F0E0D0C0B0A0908);
3440     emit_int64(0x1716151413121110);
3441     emit_int64(0x1F1E1D1C1B1A1918);
3442     emit_int64(0x2726252423222120);
3443     emit_int64(0x2F2E2D2C2B2A2928);
3444     emit_int64(0x3736353433323130);
3445     emit_int64(0x3F3E3D3C3B3A3938);
3446 
3447     bind(k_init);
3448     lea(len, InternalAddress(tmp));
3449     // create mask to test for negative byte inside a vector
3450     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3451     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3452 
3453 #endif
3454     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3455     ktestq(mask1, mask2);
3456     jcc(Assembler::zero, DONE);
3457 
3458     bind(BREAK_LOOP);
3459     // At least one byte in the last 64 bytes is negative.
3460     // Set up to look at the last 64 bytes as if they were a tail
3461     lea(ary1, Address(ary1, len, Address::times_1));
3462     addptr(result, len);
3463     // Ignore the very last byte: if all others are positive,
3464     // it must be negative, so we can skip right to the 2+1 byte
3465     // end comparison at this point
3466     orl(result, 63);
3467     movl(len, 63);
3468     // Fallthru to tail compare
3469   } else {
3470 
3471     if (UseAVX >= 2 && UseSSE >= 2) {
3472       // With AVX2, use 32-byte vector compare
3473       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3474 
3475       // Compare 32-byte vectors
3476       testl(len, 0xffffffe0);   // vector count (in bytes)
3477       jccb(Assembler::zero, TAIL_START);
3478 
3479       andl(len, 0xffffffe0);
3480       lea(ary1, Address(ary1, len, Address::times_1));
3481       negptr(len);
3482 
3483       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3484       movdl(vec2, tmp1);
3485       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3486 
3487       bind(COMPARE_WIDE_VECTORS);
3488       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3489       vptest(vec1, vec2);
3490       jccb(Assembler::notZero, BREAK_LOOP);
3491       addptr(len, 32);
3492       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3493 
3494       testl(result, 0x0000001f);   // any bytes remaining?
3495       jcc(Assembler::zero, DONE);
3496 
3497       // Quick test using the already prepared vector mask
3498       movl(len, result);
3499       andl(len, 0x0000001f);
3500       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3501       vptest(vec1, vec2);
3502       jcc(Assembler::zero, DONE);
3503       // There is at least one negative byte; jump to the tail to determine exactly where
3504       jmpb(TAIL_START);
3505 
3506       bind(BREAK_LOOP);
3507       // At least one byte in the last 32-byte vector is negative.
3508       // Set up to look at the last 32 bytes as if they were a tail
3509       lea(ary1, Address(ary1, len, Address::times_1));
3510       addptr(result, len);
3511       // Ignore the very last byte: if all others are positive,
3512       // it must be negative, so we can skip right to the 2+1 byte
3513       // end comparison at this point
3514       orl(result, 31);
3515       movl(len, 31);
3516       // Fallthru to tail compare
3517     } else if (UseSSE42Intrinsics) {
3518       // With SSE4.2, use double quad vector compare
3519       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3520 
3521       // Compare 16-byte vectors
3522       testl(len, 0xfffffff0);   // vector count (in bytes)
3523       jcc(Assembler::zero, TAIL_START);
3524 
3525       andl(len, 0xfffffff0);
3526       lea(ary1, Address(ary1, len, Address::times_1));
3527       negptr(len);
3528 
3529       movl(tmp1, 0x80808080);
3530       movdl(vec2, tmp1);
3531       pshufd(vec2, vec2, 0);
3532 
3533       bind(COMPARE_WIDE_VECTORS);
3534       movdqu(vec1, Address(ary1, len, Address::times_1));
3535       ptest(vec1, vec2);
3536       jccb(Assembler::notZero, BREAK_LOOP);
3537       addptr(len, 16);
3538       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3539 
3540       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3541       jcc(Assembler::zero, DONE);
3542 
3543       // Quick test using the already prepared vector mask
3544       movl(len, result);
3545       andl(len, 0x0000000f);   // tail count (in bytes)
3546       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3547       ptest(vec1, vec2);
3548       jcc(Assembler::zero, DONE);
3549       jmpb(TAIL_START);
3550 
3551       bind(BREAK_LOOP);
3552       // At least one byte in the last 16-byte vector is negative.
3553       // Set up and look at the last 16 bytes as if they were a tail
3554       lea(ary1, Address(ary1, len, Address::times_1));
3555       addptr(result, len);
3556       // Ignore the very last byte: if all others are positive,
3557       // it must be negative, so we can skip right to the 2+1 byte
3558       // end comparison at this point
3559       orl(result, 15);
3560       movl(len, 15);
3561       // Fallthru to tail compare
3562     }
3563   }
3564 
3565   bind(TAIL_START);
3566   // Compare 4-byte vectors
3567   andl(len, 0xfffffffc); // vector count (in bytes)
3568   jccb(Assembler::zero, COMPARE_CHAR);
3569 
3570   lea(ary1, Address(ary1, len, Address::times_1));
3571   negptr(len);
3572 
3573   bind(COMPARE_VECTORS);
3574   movl(tmp1, Address(ary1, len, Address::times_1));
3575   andl(tmp1, 0x80808080);
3576   jccb(Assembler::notZero, TAIL_ADJUST);
3577   addptr(len, 4);
3578   jccb(Assembler::notZero, COMPARE_VECTORS);
3579 
3580   // Compare trailing char (final 2-3 bytes), if any
3581   bind(COMPARE_CHAR);
3582 
3583   testl(result, 0x2);   // tail  char
3584   jccb(Assembler::zero, COMPARE_BYTE);
3585   load_unsigned_short(tmp1, Address(ary1, 0));
3586   andl(tmp1, 0x00008080);
3587   jccb(Assembler::notZero, CHAR_ADJUST);
3588   lea(ary1, Address(ary1, 2));
3589 
3590   bind(COMPARE_BYTE);
3591   testl(result, 0x1);   // tail  byte
3592   jccb(Assembler::zero, DONE);
3593   load_unsigned_byte(tmp1, Address(ary1, 0));
3594   testl(tmp1, 0x00000080);
3595   jccb(Assembler::zero, DONE);
3596   subptr(result, 1);
3597   jmpb(DONE);
3598 
3599   bind(TAIL_ADJUST);
3600   // There are negative bytes in the last 4-byte block.
3601   // Adjust result and check the next three bytes.
3602   addptr(result, len);
3603   orl(result, 3);
3604   lea(ary1, Address(ary1, len, Address::times_1));
3605   jmpb(COMPARE_CHAR);
3606 
3607   bind(CHAR_ADJUST);
3608   // We are looking at a char + optional byte tail, and found that one
3609   // of the bytes in the char is negative. Adjust the result, check the
3610   // first byte and readjust if needed.
3611   andl(result, 0xfffffffc);
3612   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
3613   jccb(Assembler::notZero, DONE);
3614   addptr(result, 1);
3615 
3616   // That's it
3617   bind(DONE);
3618   if (UseAVX >= 2 && UseSSE >= 2) {
3619     // clean upper bits of YMM registers
3620     vpxor(vec1, vec1);
3621     vpxor(vec2, vec2);
3622   }
3623 }
3624 
3625 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
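     // The result is 1 if the compared regions are equal and 0 otherwise.
     // Roughly equivalent scalar logic for the array case (illustrative sketch
     // only, not the emitted code):
     //   static boolean equals(byte[] a, byte[] b) {
     //     if (a == b) return true;
     //     if (a == null || b == null || a.length != b.length) return false;
     //     for (int i = 0; i < a.length; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }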
3626 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3627                                       Register limit, Register result, Register chr,
3628                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3629   ShortBranchVerifier sbv(this);
3630   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3631 
3632   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3633   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3634 
3635   if (is_array_equ) {
3636     // Check the input args
3637     cmpoop(ary1, ary2);
3638     jcc(Assembler::equal, TRUE_LABEL);
3639 
3640     // Need additional checks for arrays_equals.
3641     testptr(ary1, ary1);
3642     jcc(Assembler::zero, FALSE_LABEL);
3643     testptr(ary2, ary2);
3644     jcc(Assembler::zero, FALSE_LABEL);
3645 
3646     // Check the lengths
3647     movl(limit, Address(ary1, length_offset));
3648     cmpl(limit, Address(ary2, length_offset));
3649     jcc(Assembler::notEqual, FALSE_LABEL);
3650   }
3651 
3652   // count == 0
3653   testl(limit, limit);
3654   jcc(Assembler::zero, TRUE_LABEL);
3655 
3656   if (is_array_equ) {
3657     // Load array address
3658     lea(ary1, Address(ary1, base_offset));
3659     lea(ary2, Address(ary2, base_offset));
3660   }
3661 
3662   if (is_array_equ && is_char) {
3663     // arrays_equals when used for char[].
3664     shll(limit, 1);      // byte count != 0
3665   }
3666   movl(result, limit); // copy
3667 
3668   if (UseAVX >= 2) {
3669     // With AVX2, use 32-byte vector compare
3670     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3671 
3672     // Compare 32-byte vectors
3673     andl(result, 0x0000001f);  //   tail count (in bytes)
3674     andl(limit, 0xffffffe0);   // vector count (in bytes)
3675     jcc(Assembler::zero, COMPARE_TAIL);
3676 
3677     lea(ary1, Address(ary1, limit, Address::times_1));
3678     lea(ary2, Address(ary2, limit, Address::times_1));
3679     negptr(limit);
3680 
3681 #ifdef _LP64
3682     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3683       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3684 
3685       cmpl(limit, -64);
3686       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3687 
3688       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3689 
3690       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3691       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3692       kortestql(mask, mask);
3693       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3694       addptr(limit, 64);  // update since we already compared at this addr
3695       cmpl(limit, -64);
3696       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3697 
3698       // At this point we may still need to compare -limit+result bytes.
3699       // We could execute the next two instructions and just continue via the non-wide path:
3700       //  cmpl(limit, 0);
3701       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3702       // But since we stopped at the points ary{1,2}+limit which are
3703       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3704       // (|limit| <= 32 and result < 32),
3705       // we may just compare the last 64 bytes.
3706       //
3707       addptr(result, -64);   // it is safe because we just came from this area
3708       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3709       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3710       kortestql(mask, mask);
3711       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3712 
3713       jmp(TRUE_LABEL);
3714 
3715       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3716 
3717     }//if (VM_Version::supports_avx512vlbw())
3718 #endif //_LP64
3719     bind(COMPARE_WIDE_VECTORS);
3720     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3721     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3722     vpxor(vec1, vec2);
3723 
3724     vptest(vec1, vec1);
3725     jcc(Assembler::notZero, FALSE_LABEL);
3726     addptr(limit, 32);
3727     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3728 
3729     testl(result, result);
3730     jcc(Assembler::zero, TRUE_LABEL);
3731 
3732     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3733     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3734     vpxor(vec1, vec2);
3735 
3736     vptest(vec1, vec1);
3737     jccb(Assembler::notZero, FALSE_LABEL);
3738     jmpb(TRUE_LABEL);
3739 
3740     bind(COMPARE_TAIL); // limit is zero
3741     movl(limit, result);
3742     // Fallthru to tail compare
3743   } else if (UseSSE42Intrinsics) {
3744     // With SSE4.2, use double quad vector compare
3745     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3746 
3747     // Compare 16-byte vectors
3748     andl(result, 0x0000000f);  //   tail count (in bytes)
3749     andl(limit, 0xfffffff0);   // vector count (in bytes)
3750     jcc(Assembler::zero, COMPARE_TAIL);
3751 
3752     lea(ary1, Address(ary1, limit, Address::times_1));
3753     lea(ary2, Address(ary2, limit, Address::times_1));
3754     negptr(limit);
3755 
3756     bind(COMPARE_WIDE_VECTORS);
3757     movdqu(vec1, Address(ary1, limit, Address::times_1));
3758     movdqu(vec2, Address(ary2, limit, Address::times_1));
3759     pxor(vec1, vec2);
3760 
3761     ptest(vec1, vec1);
3762     jcc(Assembler::notZero, FALSE_LABEL);
3763     addptr(limit, 16);
3764     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3765 
3766     testl(result, result);
3767     jcc(Assembler::zero, TRUE_LABEL);
3768 
3769     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3770     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3771     pxor(vec1, vec2);
3772 
3773     ptest(vec1, vec1);
3774     jccb(Assembler::notZero, FALSE_LABEL);
3775     jmpb(TRUE_LABEL);
3776 
3777     bind(COMPARE_TAIL); // limit is zero
3778     movl(limit, result);
3779     // Fallthru to tail compare
3780   }
3781 
3782   // Compare 4-byte vectors
3783   andl(limit, 0xfffffffc); // vector count (in bytes)
3784   jccb(Assembler::zero, COMPARE_CHAR);
3785 
3786   lea(ary1, Address(ary1, limit, Address::times_1));
3787   lea(ary2, Address(ary2, limit, Address::times_1));
3788   negptr(limit);
3789 
3790   bind(COMPARE_VECTORS);
3791   movl(chr, Address(ary1, limit, Address::times_1));
3792   cmpl(chr, Address(ary2, limit, Address::times_1));
3793   jccb(Assembler::notEqual, FALSE_LABEL);
3794   addptr(limit, 4);
3795   jcc(Assembler::notZero, COMPARE_VECTORS);
3796 
3797   // Compare trailing char (final 2 bytes), if any
3798   bind(COMPARE_CHAR);
3799   testl(result, 0x2);   // tail  char
3800   jccb(Assembler::zero, COMPARE_BYTE);
3801   load_unsigned_short(chr, Address(ary1, 0));
3802   load_unsigned_short(limit, Address(ary2, 0));
3803   cmpl(chr, limit);
3804   jccb(Assembler::notEqual, FALSE_LABEL);
3805 
3806   if (is_array_equ && is_char) {
3807     bind(COMPARE_BYTE);
3808   } else {
3809     lea(ary1, Address(ary1, 2));
3810     lea(ary2, Address(ary2, 2));
3811 
3812     bind(COMPARE_BYTE);
3813     testl(result, 0x1);   // tail  byte
3814     jccb(Assembler::zero, TRUE_LABEL);
3815     load_unsigned_byte(chr, Address(ary1, 0));
3816     load_unsigned_byte(limit, Address(ary2, 0));
3817     cmpl(chr, limit);
3818     jccb(Assembler::notEqual, FALSE_LABEL);
3819   }
3820   bind(TRUE_LABEL);
3821   movl(result, 1);   // return true
3822   jmpb(DONE);
3823 
3824   bind(FALSE_LABEL);
3825   xorl(result, result); // return false
3826 
3827   // That's it
3828   bind(DONE);
3829   if (UseAVX >= 2) {
3830     // clean upper bits of YMM registers
3831     vpxor(vec1, vec1);
3832     vpxor(vec2, vec2);
3833   }
3834 }
3835 
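     // The evmasked_op() overloads below dispatch a C2 ideal opcode to the
     // corresponding AVX-512 masked instruction. Per-lane semantics, as an
     // illustrative pseudocode sketch (not the emitted code):
     //   for (int lane = 0; lane < num_lanes; lane++) {
     //     if (mask.bit(lane)) {
     //       dst[lane] = op(src1[lane], src2[lane]);
     //     } else {
     //       dst[lane] = merge ? dst[lane] : 0;  // merge- vs. zero-masking
     //     }
     //   }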
3836 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3837                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
3838   switch(ideal_opc) {
3839     case Op_LShiftVS:
3840       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
3841     case Op_LShiftVI:
3842       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
3843     case Op_LShiftVL:
3844       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
3845     case Op_RShiftVS:
3846       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
3847     case Op_RShiftVI:
3848       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
3849     case Op_RShiftVL:
3850       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
3851     case Op_URShiftVS:
3852       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
3853     case Op_URShiftVI:
3854       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
3855     case Op_URShiftVL:
3856       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
3857     case Op_RotateRightV:
3858       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3859     case Op_RotateLeftV:
3860       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
3861     default:
3862       fatal("Unsupported masked operation"); break;
3863   }
3864 }
3865 
3866 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3867                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
3868                                     bool is_varshift) {
3869   switch (ideal_opc) {
3870     case Op_AddVB:
3871       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3872     case Op_AddVS:
3873       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3874     case Op_AddVI:
3875       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3876     case Op_AddVL:
3877       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3878     case Op_AddVF:
3879       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3880     case Op_AddVD:
3881       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3882     case Op_SubVB:
3883       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3884     case Op_SubVS:
3885       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
3886     case Op_SubVI:
3887       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
3888     case Op_SubVL:
3889       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
3890     case Op_SubVF:
3891       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
3892     case Op_SubVD:
3893       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
3894     case Op_MulVS:
3895       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
3896     case Op_MulVI:
3897       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
3898     case Op_MulVL:
3899       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
3900     case Op_MulVF:
3901       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
3902     case Op_MulVD:
3903       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
3904     case Op_DivVF:
3905       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
3906     case Op_DivVD:
3907       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
3908     case Op_SqrtVF:
3909       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
3910     case Op_SqrtVD:
3911       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
3912     case Op_AbsVB:
3913       evpabsb(dst, mask, src2, merge, vlen_enc); break;
3914     case Op_AbsVS:
3915       evpabsw(dst, mask, src2, merge, vlen_enc); break;
3916     case Op_AbsVI:
3917       evpabsd(dst, mask, src2, merge, vlen_enc); break;
3918     case Op_AbsVL:
3919       evpabsq(dst, mask, src2, merge, vlen_enc); break;
3920     case Op_FmaVF:
3921       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
3922     case Op_FmaVD:
3923       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
3924     case Op_VectorRearrange:
3925       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
3926     case Op_LShiftVS:
3927       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3928     case Op_LShiftVI:
3929       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3930     case Op_LShiftVL:
3931       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3932     case Op_RShiftVS:
3933       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3934     case Op_RShiftVI:
3935       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3936     case Op_RShiftVL:
3937       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3938     case Op_URShiftVS:
3939       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3940     case Op_URShiftVI:
3941       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3942     case Op_URShiftVL:
3943       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
3944     case Op_RotateLeftV:
3945       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3946     case Op_RotateRightV:
3947       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3948     case Op_MaxV:
3949       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3950     case Op_MinV:
3951       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3952     case Op_XorV:
3953       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3954     case Op_OrV:
3955       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3956     case Op_AndV:
3957       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
3958     default:
3959       fatal("Unsupported masked operation"); break;
3960   }
3961 }
3962 
3963 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
3964                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
3965   switch (ideal_opc) {
3966     case Op_AddVB:
3967       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
3968     case Op_AddVS:
3969       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
3970     case Op_AddVI:
3971       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
3972     case Op_AddVL:
3973       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
3974     case Op_AddVF:
3975       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
3976     case Op_AddVD:
3977       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
3978     case Op_SubVB:
3979       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
3980     case Op_SubVS:
3981       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
3982     case Op_SubVI:
3983       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
3984     case Op_SubVL:
3985       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
3986     case Op_SubVF:
3987       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
3988     case Op_SubVD:
3989       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
3990     case Op_MulVS:
3991       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
3992     case Op_MulVI:
3993       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
3994     case Op_MulVL:
3995       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
3996     case Op_MulVF:
3997       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
3998     case Op_MulVD:
3999       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4000     case Op_DivVF:
4001       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4002     case Op_DivVD:
4003       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4004     case Op_FmaVF:
4005       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4006     case Op_FmaVD:
4007       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4008     case Op_MaxV:
4009       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4010     case Op_MinV:
4011       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4012     case Op_XorV:
4013       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4014     case Op_OrV:
4015       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4016     case Op_AndV:
4017       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4018     default:
4019       fatal("Unsupported masked operation"); break;
4020   }
4021 }
4022 
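     // Bit-wise combination of two opmask (k) registers. mask_len selects an
     // element type below, which in turn selects the width of the emitted
     // k-register instruction. Illustrative pseudocode sketch (not the emitted
     // code):
     //   for (int i = 0; i < mask_len; i++) {
     //     dst.bit(i) = op(src1.bit(i), src2.bit(i));  // op is &, | or ^
     //   }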
4023 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4024                                   KRegister src1, KRegister src2) {
4025   BasicType etype = T_ILLEGAL;
4026   switch(mask_len) {
4027     case 2:
4028     case 4:
4029     case 8:  etype = T_BYTE; break;
4030     case 16: etype = T_SHORT; break;
4031     case 32: etype = T_INT; break;
4032     case 64: etype = T_LONG; break;
4033     default: fatal("Unsupported type"); break;
4034   }
4035   assert(etype != T_ILLEGAL, "");
4036   switch(ideal_opc) {
4037     case Op_AndVMask:
4038       kand(etype, dst, src1, src2); break;
4039     case Op_OrVMask:
4040       kor(etype, dst, src1, src2); break;
4041     case Op_XorVMask:
4042       kxor(etype, dst, src1, src2); break;
4043     default:
4044       fatal("Unsupported masked operation"); break;
4045   }
4046 }
4047 
4048 /*
4049  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4050  * If src is NaN, the result is 0.
4051  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4052  * the result is equal to the value of Integer.MIN_VALUE.
4053  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4054  * the result is equal to the value of Integer.MAX_VALUE.
4055  */
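     // Per-lane scalar equivalent (illustrative sketch of Java's (int) cast
     // semantics, not the emitted code):
     //   static int f2i(float f) {
     //     if (Float.isNaN(f)) return 0;
     //     if (f <= (float) Integer.MIN_VALUE) return Integer.MIN_VALUE;
     //     if (f >= (float) Integer.MAX_VALUE) return Integer.MAX_VALUE;
     //     return (int) f;
     //   }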
4056 void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4057                                                             XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4058                                                             Register scratch, AddressLiteral float_sign_flip,
4059                                                             int vec_enc) {
4060   Label done;
4061   vmovdqu(xtmp1, float_sign_flip, scratch, vec_enc);
4062   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4063   vptest(xtmp2, xtmp2, vec_enc);
4064   jccb(Assembler::equal, done);
4065 
4066   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4067   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4068 
4069   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4070   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4071   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4072 
4073   // Recompute the mask for the remaining special values.
4074   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4075   // Extract SRC values corresponding to TRUE mask lanes.
4076   vpand(xtmp4, xtmp2, src, vec_enc);
4077   // Flip mask bits so that the MSB of the MASK lanes corresponding to +ve special
4078   // values is set.
4079   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4080 
4081   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4082   bind(done);
4083 }
4084 
4085 void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4086                                                              XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4087                                                              Register scratch, AddressLiteral float_sign_flip,
4088                                                              int vec_enc) {
4089   Label done;
4090   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, scratch);
4091   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4092   kortestwl(ktmp1, ktmp1);
4093   jccb(Assembler::equal, done);
4094 
4095   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4096   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4097   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4098 
4099   kxorwl(ktmp1, ktmp1, ktmp2);
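       // ktmp1 now selects the special lanes that are not NaN. Among those, pick the lanes whose
       // src value is >= 0 (NLT_UQ) and overwrite them with ~float_sign_flip = 0x7FFFFFFF
       // (Integer.MAX_VALUE); vpternlogd with imm 0x11 computes ~B & ~C, i.e. the bitwise NOT of xtmp1.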
4100   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4101   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4102   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4103   bind(done);
4104 }
4105 
4106 /*
4107  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4108  * If the src is NaN, the result is 0.
4109  * If the src is negative infinity or any value less than or equal to Long.MIN_VALUE,
4110  * the result is Long.MIN_VALUE.
4111  * If the src is positive infinity or any value greater than or equal to Long.MAX_VALUE,
4112  * the result is Long.MAX_VALUE.
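      *
      * A few concrete examples of the required Java narrowing-conversion semantics:
      *   (long) Double.NaN               == 0L
      *   (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE
      *   (long) Double.POSITIVE_INFINITY == Long.MAX_VALUE
      *   (long) 1.0e19                   == Long.MAX_VALUE   (1.0e19 > 2^63 - 1)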
4113  */
4114 void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4115                                                               XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4116                                                               Register scratch, AddressLiteral double_sign_flip,
4117                                                               int vec_enc) {
4118   Label done;
4119   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch);
4120   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4121   kortestwl(ktmp1, ktmp1);
4122   jccb(Assembler::equal, done);
4123 
4124   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4125   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4126   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4127 
4128   kxorwl(ktmp1, ktmp1, ktmp2);
4129   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4130   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4131   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4132   bind(done);
4133 }
4134 
4135 /*
4136  * Algorithm for vector D2L and F2I conversions:-
4137  * a) Perform vector D2L/F2I cast.
4138  * b) Take the fast path if none of the result vector lanes contains the value 0x80000000;
4139  *    that value signifies that the source value could be one of the special floating point
4140  *    values (NaN, -Inf, Inf, Max, -Min).
4141  * c) Set the destination lane to zero if the source lane is NaN.
4142  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
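      *
      * The 0x80000000 check works because VCVTTPS2DQ/VCVTTPD2QQ return the "integer indefinite"
      * value (0x80000000 / 0x8000000000000000) for NaN and for out-of-range inputs, so only lanes
      * holding that value can require the fix-up above.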
4143  */
4144 
4145 void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4146                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4147                                             Register scratch, int vec_enc) {
4148   evcvttpd2qq(dst, src, vec_enc);
4149   vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc);
4150 }
4151 
4152 void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4153                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4154                                            AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
4155   vcvttps2dq(dst, src, vec_enc);
4156   vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc);
4157 }
4158 
4159 void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4160                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4161                                             Register scratch, int vec_enc) {
4162   vcvttps2dq(dst, src, vec_enc);
4163   vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc);
4164 }
4165 
4166 #ifdef _LP64
4167 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4168                                                  KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4169                                                  AddressLiteral new_mxcsr, Register scratch, int vec_enc) {
4170   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
4171   // and restore the original MXCSR.RC mode afterwards.
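       // This matches the Math.round(double) definition floor(x + 0.5): e.g. for x = -2.5 the sum
       // -2.0 rounds (down) to -2, and for x = 2.5 the sum 3.0 rounds to 3, as required.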
4172   ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
4173   ldmxcsr(new_mxcsr, scratch);
4174   mov64(scratch, julong_cast(0.5L));
4175   evpbroadcastq(xtmp1, scratch, vec_enc);
4176   vaddpd(xtmp1, src , xtmp1, vec_enc);
4177   evcvtpd2qq(dst, xtmp1, vec_enc);
4178   vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc);
4179   ldmxcsr(mxcsr_std, scratch);
4180 }
4181 
4182 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4183                                                 KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4184                                                 AddressLiteral new_mxcsr, Register scratch, int vec_enc) {
4185   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
4186   // and restore the original MXCSR.RC mode afterwards.
4187   ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
4188   ldmxcsr(new_mxcsr, scratch);
4189   movl(scratch, jint_cast(0.5));
4190   movq(xtmp1, scratch);
4191   vbroadcastss(xtmp1, xtmp1, vec_enc);
4192   vaddps(xtmp1, src , xtmp1, vec_enc);
4193   vcvtps2dq(dst, xtmp1, vec_enc);
4194   vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc);
4195   ldmxcsr(mxcsr_std, scratch);
4196 }
4197 
4198 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4199                                                XMMRegister xtmp3, XMMRegister xtmp4, AddressLiteral float_sign_flip,
4200                                                AddressLiteral new_mxcsr, Register scratch, int vec_enc) {
4201   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
4202   // and restore the original MXCSR.RC mode afterwards.
4203   ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
4204   ldmxcsr(new_mxcsr, scratch);
4205   movl(scratch, jint_cast(0.5));
4206   movq(xtmp1, scratch);
4207   vbroadcastss(xtmp1, xtmp1, vec_enc);
4208   vaddps(xtmp1, src , xtmp1, vec_enc);
4209   vcvtps2dq(dst, xtmp1, vec_enc);
4210   vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc);
4211   ldmxcsr(mxcsr_std, scratch);
4212 }
4213 #endif
4214 
4215 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4216                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4217   switch (from_elem_bt) {
4218     case T_BYTE:
4219       switch (to_elem_bt) {
4220         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4221         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4222         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4223         default: ShouldNotReachHere();
4224       }
4225       break;
4226     case T_SHORT:
4227       switch (to_elem_bt) {
4228         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4229         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4230         default: ShouldNotReachHere();
4231       }
4232       break;
4233     case T_INT:
4234       assert(to_elem_bt == T_LONG, "");
4235       vpmovzxdq(dst, src, vlen_enc);
4236       break;
4237     default:
4238       ShouldNotReachHere();
4239   }
4240 }
4241 
4242 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4243                                    bool merge, BasicType bt, int vlen_enc) {
4244   if (bt == T_INT) {
4245     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4246   } else {
4247     assert(bt == T_LONG, "");
4248     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4249   }
4250 }
4251 
4252 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4253                                    bool merge, BasicType bt, int vlen_enc) {
4254   if (bt == T_INT) {
4255     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4256   } else {
4257     assert(bt == T_LONG, "");
4258     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4259   }
4260 }
4261 
4262 #ifdef _LP64
4263 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4264                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4265                                                int vec_enc) {
4266   int index = 0;
4267   int vindex = 0;
4268   mov64(rtmp1, 0x0101010101010101L);
4269   pdep(rtmp1, src, rtmp1);
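       // PDEP scatters the low 8 bits of src to the set positions of the 0x0101010101010101 mask,
       // i.e. into the least significant bit of each byte, producing byte lanes of 0 or 1.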
4270   if (mask_len > 8) {
4271     movq(rtmp2, src);
4272     vpxor(xtmp, xtmp, xtmp, vec_enc);
4273     movq(xtmp, rtmp1);
4274   }
4275   movq(dst, rtmp1);
4276 
4277   mask_len -= 8;
4278   while (mask_len > 0) {
4279     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
4280     index++;
4281     if ((index % 2) == 0) {
4282       pxor(xtmp, xtmp);
4283     }
4284     mov64(rtmp1, 0x0101010101010101L);
4285     shrq(rtmp2, 8);
4286     pdep(rtmp1, rtmp2, rtmp1);
4287     pinsrq(xtmp, rtmp1, index % 2);
4288     vindex = index / 2;
4289     if (vindex) {
4290       // Write the entire 16 byte vector only when both 64 bit
4291       // lanes have been updated, to save redundant instructions.
4292       if (index % 2) {
4293         vinsertf128(dst, dst, xtmp, vindex);
4294       }
4295     } else {
4296       vmovdqu(dst, xtmp);
4297     }
4298     mask_len -= 8;
4299   }
4300 }
4301 
4302 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
4303   switch(opc) {
4304     case Op_VectorMaskTrueCount:
4305       popcntq(dst, tmp);
4306       break;
4307     case Op_VectorMaskLastTrue:
4308       if (VM_Version::supports_lzcnt()) {
4309         lzcntq(tmp, tmp);
4310         movl(dst, 63);
4311         subl(dst, tmp);
4312       } else {
4313         movl(dst, -1);
4314         bsrq(tmp, tmp);
4315         cmov32(Assembler::notZero, dst, tmp);
4316       }
4317       break;
4318     case Op_VectorMaskFirstTrue:
4319       if (VM_Version::supports_bmi1()) {
4320         if (masklen < 32) {
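               // Set a sentinel bit at position masklen so that tzcnt returns masklen
               // when no mask bit is set.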
4321           orl(tmp, 1 << masklen);
4322           tzcntl(dst, tmp);
4323         } else if (masklen == 32) {
4324           tzcntl(dst, tmp);
4325         } else {
4326           assert(masklen == 64, "");
4327           tzcntq(dst, tmp);
4328         }
4329       } else {
4330         if (masklen < 32) {
4331           orl(tmp, 1 << masklen);
4332           bsfl(dst, tmp);
4333         } else {
4334           assert(masklen == 32 || masklen == 64, "");
4335           movl(dst, masklen);
4336           if (masklen == 32)  {
4337             bsfl(tmp, tmp);
4338           } else {
4339             bsfq(tmp, tmp);
4340           }
4341           cmov32(Assembler::notZero, dst, tmp);
4342         }
4343       }
4344       break;
4345     case Op_VectorMaskToLong:
4346       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
4347       break;
4348     default: assert(false, "Unhandled mask operation");
4349   }
4350 }
4351 
4352 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
4353                                               int masklen, int masksize, int vec_enc) {
4354   assert(VM_Version::supports_popcnt(), "");
4355 
4356   if(VM_Version::supports_avx512bw()) {
4357     kmovql(tmp, mask);
4358   } else {
4359     assert(masklen <= 16, "");
4360     kmovwl(tmp, mask);
4361   }
4362 
4363   // Masks generated by partial vector comparison/replicate/mask-manipulation
4364   // operations need to be clipped.
4365   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
4366     andq(tmp, (1 << masklen) - 1);
4367   }
4368 
4369   vector_mask_operation_helper(opc, dst, tmp, masklen);
4370 }
4371 
4372 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4373                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
4374   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
4375          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
4376   assert(VM_Version::supports_popcnt(), "");
4377 
4378   bool need_clip = false;
4379   switch(bt) {
4380     case T_BOOLEAN:
4381       // While masks of other types contain lane values of 0 and -1, boolean masks contain lane values of 0 and 1.
4382       vpxor(xtmp, xtmp, xtmp, vec_enc);
4383       vpsubb(xtmp, xtmp, mask, vec_enc);
4384       vpmovmskb(tmp, xtmp, vec_enc);
4385       need_clip = masklen < 16;
4386       break;
4387     case T_BYTE:
4388       vpmovmskb(tmp, mask, vec_enc);
4389       need_clip = masklen < 16;
4390       break;
4391     case T_SHORT:
4392       vpacksswb(xtmp, mask, mask, vec_enc);
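           // vpacksswb packs within each 128-bit lane; for 256-bit masks, vpermpd with selector 8
           // (qwords 0,2,0,0) moves the two packed halves next to each other so that a single
           // 128-bit vpmovmskb covers all lanes.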
4393       if (masklen >= 16) {
4394         vpermpd(xtmp, xtmp, 8, vec_enc);
4395       }
4396       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
4397       need_clip = masklen < 16;
4398       break;
4399     case T_INT:
4400     case T_FLOAT:
4401       vmovmskps(tmp, mask, vec_enc);
4402       need_clip = masklen < 4;
4403       break;
4404     case T_LONG:
4405     case T_DOUBLE:
4406       vmovmskpd(tmp, mask, vec_enc);
4407       need_clip = masklen < 2;
4408       break;
4409     default: assert(false, "Unhandled type, %s", type2name(bt));
4410   }
4411 
4412   // Masks generated by partial vector comparison/replicate/mask-manipulation
4413   // operations need to be clipped.
4414   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4415     // need_clip implies masklen < 32
4416     andq(tmp, (1 << masklen) - 1);
4417   }
4418 
4419   vector_mask_operation_helper(opc, dst, tmp, masklen);
4420 }
4421 
4422 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
4423                                              Register rtmp2, int mask_len) {
4424   kmov(rtmp1, src);
4425   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
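       // PEXT of an all-ones source through the clipped mask packs one set bit per set mask bit
       // into the low positions, i.e. it produces (1 << popcount(mask)) - 1, the mask that
       // selects the lanes of the compressed result.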
4426   mov64(rtmp2, -1L);
4427   pext(rtmp2, rtmp2, rtmp1);
4428   kmov(dst, rtmp2);
4429 }
4430 
4431 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
4432                                                bool merge, BasicType bt, int vec_enc) {
4433   if (opcode == Op_CompressV) {
4434     switch(bt) {
4435     case T_BYTE:
4436       evpcompressb(dst, mask, src, merge, vec_enc);
4437       break;
4438     case T_CHAR:
4439     case T_SHORT:
4440       evpcompressw(dst, mask, src, merge, vec_enc);
4441       break;
4442     case T_INT:
4443       evpcompressd(dst, mask, src, merge, vec_enc);
4444       break;
4445     case T_FLOAT:
4446       evcompressps(dst, mask, src, merge, vec_enc);
4447       break;
4448     case T_LONG:
4449       evpcompressq(dst, mask, src, merge, vec_enc);
4450       break;
4451     case T_DOUBLE:
4452       evcompresspd(dst, mask, src, merge, vec_enc);
4453       break;
4454     default:
4455       fatal("Unsupported type");
4456       break;
4457     }
4458   } else {
4459     assert(opcode == Op_ExpandV, "");
4460     switch(bt) {
4461     case T_BYTE:
4462       evpexpandb(dst, mask, src, merge, vec_enc);
4463       break;
4464     case T_CHAR:
4465     case T_SHORT:
4466       evpexpandw(dst, mask, src, merge, vec_enc);
4467       break;
4468     case T_INT:
4469       evpexpandd(dst, mask, src, merge, vec_enc);
4470       break;
4471     case T_FLOAT:
4472       evexpandps(dst, mask, src, merge, vec_enc);
4473       break;
4474     case T_LONG:
4475       evpexpandq(dst, mask, src, merge, vec_enc);
4476       break;
4477     case T_DOUBLE:
4478       evexpandpd(dst, mask, src, merge, vec_enc);
4479       break;
4480     default:
4481       fatal("Unsupported type");
4482       break;
4483     }
4484   }
4485 }
4486 #endif
4487 
4488 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
4489   if (VM_Version::supports_avx512bw()) {
4490     if (mask_len > 32) {
4491       kmovql(dst, src);
4492     } else {
4493       kmovdl(dst, src);
4494       if (mask_len != 32) {
4495         kshiftrdl(dst, dst, 32 - mask_len);
4496       }
4497     }
4498   } else {
4499     assert(mask_len <= 16, "");
4500     kmovwl(dst, src);
4501     if (mask_len != 16) {
4502       kshiftrwl(dst, dst, 16 - mask_len);
4503     }
4504   }
4505 }
4506 
4507 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
4508   int lane_size = type2aelembytes(bt);
4509   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
4510   if ((is_LP64 || lane_size < 8) &&
4511       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
4512        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
4513     movptr(rtmp, imm32);
4514     switch(lane_size) {
4515       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
4516       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
4517       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
4518       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
4519       default : ShouldNotReachHere(); break;
4520     }
4521   } else {
4522     movptr(rtmp, imm32);
4523     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
4524     switch(lane_size) {
4525       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
4526       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
4527       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
4528       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
4529       default : ShouldNotReachHere(); break;
4530     }
4531   }
4532 }
4533 
4534 //
4535 // The following is a lookup-table based popcount computation algorithm:
4536 //       Index   Bit set count
4537 //     [ 0000 ->   0,
4538 //       0001 ->   1,
4539 //       0010 ->   1,
4540 //       0011 ->   2,
4541 //       0100 ->   1,
4542 //       0101 ->   2,
4543 //       0110 ->   2,
4544 //       0111 ->   3,
4545 //       1000 ->   1,
4546 //       1001 ->   2,
4547 //       1010 ->   2,
4548 //       1011 ->   3,
4549 //       1100 ->   2,
4550 //       1101 ->   3,
     //       1110 ->   3,
4551 //       1111 ->   4 ]
4552 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
4553 //     shuffle indices for lookup table access.
4554 //  b. Right shift each byte of vector lane by 4 positions.
4555 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
4556 //     shuffle indices for lookup table access.
4557 //  d. Add the bitset count of upper and lower 4 bits of each byte.
4558 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
4559 //     count of all the bytes of a quadword.
4560 //  f. Perform step e. for upper 128bit vector lane.
4561 //  g. Pack the bitset count of quadwords back to double word.
4562 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
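     //
     // Worked example for steps a-d on one byte: src byte 0xD4 = 1101 0100. The lower nibble 0100
     // looks up 1 and the upper nibble 1101 looks up 3, so the per-byte result is 1 + 3 = 4,
     // which is popcount(0xD4).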
4563 
4564 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4565                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4566   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4567   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4568   vpsrlw(dst, src, 4, vec_enc);
4569   vpand(dst, dst, xtmp1, vec_enc);
4570   vpand(xtmp1, src, xtmp1, vec_enc);
4571   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp, vec_enc);
4572   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
4573   vpshufb(dst, xtmp2, dst, vec_enc);
4574   vpaddb(dst, dst, xtmp1, vec_enc);
4575 }
4576 
4577 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4578                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
4579   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4580   // Following code is as per steps e,f,g and h of above algorithm.
4581   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4582   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
4583   vpsadbw(dst, dst, xtmp2, vec_enc);
4584   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
4585   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
4586   vpackuswb(dst, xtmp1, dst, vec_enc);
4587 }
4588 
4589 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4590                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
4591   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4592   // Add the popcount of upper and lower bytes of word.
4593   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
4594   vpsrlw(dst, xtmp1, 8, vec_enc);
4595   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
4596   vpaddw(dst, dst, xtmp1, vec_enc);
4597 }
4598 
4599 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4600                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4601   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4602   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
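       // vpsadbw against a zero vector sums the eight per-byte counts into the low 16 bits of each quadword.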
4603   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
4604 }
4605 
4606 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4607                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
4608   switch(bt) {
4609     case T_LONG:
4610       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4611       break;
4612     case T_INT:
4613       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4614       break;
4615     case T_CHAR:
4616     case T_SHORT:
4617       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4618       break;
4619     case T_BYTE:
4620     case T_BOOLEAN:
4621       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4622       break;
4623     default:
4624       ShouldNotReachHere();
4625   }
4626 }
4627 
4628 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
4629                                                       KRegister mask, bool merge, int vec_enc) {
4630   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
4631   switch(bt) {
4632     case T_LONG:
4633       assert(VM_Version::supports_avx512_vpopcntdq(), "");
4634       evpopcntq(dst, mask, src, merge, vec_enc);
4635       break;
4636     case T_INT:
4637       assert(VM_Version::supports_avx512_vpopcntdq(), "");
4638       evpopcntd(dst, mask, src, merge, vec_enc);
4639       break;
4640     case T_CHAR:
4641     case T_SHORT:
4642       assert(VM_Version::supports_avx512_bitalg(), "");
4643       evpopcntw(dst, mask, src, merge, vec_enc);
4644       break;
4645     case T_BYTE:
4646     case T_BOOLEAN:
4647       assert(VM_Version::supports_avx512_bitalg(), "");
4648       evpopcntb(dst, mask, src, merge, vec_enc);
4649       break;
4650     default:
4651       ShouldNotReachHere();
4652   }
4653 }
4654 
4655 #ifndef _LP64
4656 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
4657   assert(VM_Version::supports_avx512bw(), "");
4658   kmovdl(tmp, src);
4659   kunpckdql(dst, tmp, tmp);
4660 }
4661 #endif
4662 
4663 // The bit reversal algorithm first reverses the bits of each byte and then performs
4664 // a byte level reversal for multi-byte primitive types (short/int/long).
4665 // The algorithm uses a lookup table to obtain the reversed bit sequence
4666 // corresponding to a 4 bit value. The reversed bit sequence for a byte is then
4667 // obtained by swapping the reversed bit sequences of the upper and lower
4668 // nibbles of that byte.
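     //
     // For example, for the byte 0x35 = 0011 0101: the lower nibble 0101 reverses to 1010 and the
     // upper nibble 0011 reverses to 1100; swapping the two nibbles gives 1010 1100 = 0xAC,
     // which is 0x35 with its bits reversed.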
4669 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4670                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
4671   if (VM_Version::supports_avx512vlbw()) {
4672 
4673     // Get the reverse bit sequence of lower nibble of each byte.
4674     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
4675     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
4676     vpandq(dst, xtmp2, src, vec_enc);
4677     vpshufb(dst, xtmp1, dst, vec_enc);
4678     vpsllq(dst, dst, 4, vec_enc);
4679 
4680     // Get the reverse bit sequence of upper nibble of each byte.
4681     vpandn(xtmp2, xtmp2, src, vec_enc);
4682     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
4683     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
4684 
4685     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
4686     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
4687     vporq(xtmp2, dst, xtmp2, vec_enc);
4688     vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);
4689 
4690   } else if(!VM_Version::supports_avx512vlbw() && vec_enc == Assembler::AVX_512bit) {
4691 
4692     // Shift based bit reversal.
4693     assert(bt == T_LONG || bt == T_INT, "");
4694     vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4695 
4696     // Swap lower and upper nibble of each byte.
4697     vpandq(dst, xtmp1, src, vec_enc);
4698     vpsllq(dst, dst, 4, vec_enc);
4699     vpandn(xtmp2, xtmp1, src, vec_enc);
4700     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
4701     vporq(xtmp1, dst, xtmp2, vec_enc);
4702 
4703     // Swap two least and most significant bits of each nibble.
4704     vbroadcast(T_INT, xtmp2, 0x33333333, rtmp, vec_enc);
4705     vpandq(dst, xtmp2, xtmp1, vec_enc);
4706     vpsllq(dst, dst, 2, vec_enc);
4707     vpandn(xtmp2, xtmp2, xtmp1, vec_enc);
4708     vpsrlq(xtmp2, xtmp2, 2, vec_enc);
4709     vporq(xtmp1, dst, xtmp2, vec_enc);
4710 
4711     // Swap adjacent pair of bits.
4712     vbroadcast(T_INT, xtmp2, 0x55555555, rtmp, vec_enc);
4713     vpandq(dst, xtmp2, xtmp1, vec_enc);
4714     vpsllq(dst, dst, 1, vec_enc);
4715     vpandn(xtmp2, xtmp2, xtmp1, vec_enc);
4716     vpsrlq(xtmp2, xtmp2, 1, vec_enc);
4717     vporq(xtmp1, dst, xtmp2, vec_enc);
4718 
4719     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
4720 
4721   } else {
4722     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
4723     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
4724 
4725     // Get the reverse bit sequence of lower nibble of each byte.
4726     vpand(dst, xtmp2, src, vec_enc);
4727     vpshufb(dst, xtmp1, dst, vec_enc);
4728     vpsllq(dst, dst, 4, vec_enc);
4729 
4730     // Get the reverse bit sequence of upper nibble of each byte.
4731     vpandn(xtmp2, xtmp2, src, vec_enc);
4732     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
4733     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
4734 
4735     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
4736     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
4737     vpor(xtmp2, dst, xtmp2, vec_enc);
4738     vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);
4739   }
4740 }
4741 
4742 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src,
4743                                                 XMMRegister xtmp, AddressLiteral mask, Register rtmp, int vec_enc) {
4744   // Galois field instruction based bit reversal, following the algorithm described at
4745   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
4746   assert(VM_Version::supports_gfni(), "");
4747   vpbroadcastq(xtmp, mask, vec_enc, rtmp);
4748   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
4749   vector_reverse_byte(bt, dst, xtmp, rtmp, vec_enc);
4750 }
4751 
4752 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4753                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
4754   // Shift based bit reversal.
4755   assert(VM_Version::supports_evex(), "");
4756   evmovdqul(xtmp1, k0, src, true, vec_enc);
4757   switch(bt) {
4758     case T_LONG:
4759       // Swap upper and lower double word of each quad word.
4760       evprorq(xtmp1, k0, xtmp1, 32, true, vec_enc);
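           // Fall through: after swapping the double words, also swap the words within each double word.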
4761     case T_INT:
4762       // Swap upper and lower word of each double word.
4763       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
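           // Fall through: after swapping the words, also swap the bytes within each word.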
4764     case T_SHORT:
4765       // Swap upper and lower byte of each word.
4766       vbroadcast(T_INT, dst, 0x00FF00FF, rtmp, vec_enc);
4767       vpandq(xtmp2, dst, xtmp1, vec_enc);
4768       vpsllq(xtmp2, xtmp2, 8, vec_enc);
4769       vpandn(xtmp1, dst, xtmp1, vec_enc);
4770       vpsrlq(dst, xtmp1, 8, vec_enc);
4771       vporq(dst, dst, xtmp2, vec_enc);
4772       break;
4773     case T_BYTE:
4774       evmovdquq(dst, k0, src, true, vec_enc);
4775       break;
4776     default:
4777       fatal("Unsupported type");
4778       break;
4779   }
4780 }
4781 
4782 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, Register rtmp, int vec_enc) {
4783   if (bt == T_BYTE) {
4784     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
4785       evmovdquq(dst, k0, src, true, vec_enc);
4786     } else {
4787       vmovdqu(dst, src);
4788     }
4789     return;
4790   }
4791   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
4792   // pre-computed shuffle indices.
4793   switch(bt) {
4794     case T_LONG:
4795       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), rtmp, vec_enc);
4796       break;
4797     case T_INT:
4798       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), rtmp, vec_enc);
4799       break;
4800     case T_SHORT:
4801       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), rtmp, vec_enc);
4802       break;
4803     default:
4804       fatal("Unsupported type");
4805       break;
4806   }
4807   vpshufb(dst, src, dst, vec_enc);
4808 }
4809 
4810 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
4811                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
4812                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
4813   assert(is_integral_type(bt), "");
4814   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
4815   assert(VM_Version::supports_avx512cd(), "");
4816   switch(bt) {
4817     case T_LONG:
4818       evplzcntq(dst, ktmp, src, merge, vec_enc);
4819       break;
4820     case T_INT:
4821       evplzcntd(dst, ktmp, src, merge, vec_enc);
4822       break;
4823     case T_SHORT:
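           // Interleave each short with an all-ones short below it, so each dword becomes
           // (src_word << 16) | 0xFFFF; the dword lzcnt of that value equals the 16-bit lzcnt
           // of the word (0..16), and vpackusdw packs the counts back into words.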
4824       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
4825       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
4826       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
4827       vpunpckhwd(dst, xtmp1, src, vec_enc);
4828       evplzcntd(dst, ktmp, dst, merge, vec_enc);
4829       vpackusdw(dst, xtmp2, dst, vec_enc);
4830       break;
4831     case T_BYTE:
4832       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
4833       // accessing the lookup table.
4834       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
4835       // accessing the lookup table.
4836       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
4837       assert(VM_Version::supports_avx512bw(), "");
4838       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
4839       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
4840       vpand(xtmp2, dst, src, vec_enc);
4841       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
4842       vpsrlw(xtmp3, src, 4, vec_enc);
4843       vpand(xtmp3, dst, xtmp3, vec_enc);
4844       vpshufb(dst, xtmp1, xtmp3, vec_enc);
4845       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
4846       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
4847       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
4848       break;
4849     default:
4850       ShouldNotReachHere();
4851   }
4852 }
4853 
4854 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4855                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
4856   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
4857   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
4858   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
4859   // accessing the lookup table.
4860   vpand(dst, xtmp2, src, vec_enc);
4861   vpshufb(dst, xtmp1, dst, vec_enc);
4862   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
4863   // accessing the lookup table.
4864   vpsrlw(xtmp3, src, 4, vec_enc);
4865   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
4866   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
4867   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
4868   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
4869   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
4870   vpaddb(dst, dst, xtmp2, vec_enc);
4871   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
4872 }
4873 
4874 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4875                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
4876   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4877   // Add zero counts of lower byte and upper byte of a word if
4878   // upper byte holds a zero value.
4879   vpsrlw(xtmp3, src, 8, vec_enc);
4880   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
4881   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
4882   vpsllw(xtmp2, dst, 8, vec_enc);
4883   vpaddw(xtmp2, xtmp2, dst, vec_enc);
4884   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
4885   vpsrlw(dst, dst, 8, vec_enc);
4886 }
4887 
4888 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4889                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
4890   // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
4891   // the biased exponent of the converted value can be used to compute the leading zero count:
4892   // LZCNT = 31 - (biased_exp - 127),
4893   // which the code below evaluates as 32 - ((biased_exp - 127) + 1).
4894   // Special handling is needed for zero, max_int and -ve source values.
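       //
       // For example, src = 12 converts to 12.0f = 1.5 * 2^3 with biased exponent 130, so
       // LZCNT = 31 - (130 - 127) = 28, which is the number of leading zeros of 0x0000000C.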
4895 
4896   // Broadcast 0xFF
4897   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
4898   vpsrld(xtmp1, xtmp1, 24, vec_enc);
4899 
4900   // Extract biased exponent.
4901   vcvtdq2ps(dst, src, vec_enc);
4902   vpsrld(dst, dst, 23, vec_enc);
4903   vpand(dst, dst, xtmp1, vec_enc);
4904 
4905   // Broadcast 127.
4906   vpsrld(xtmp1, xtmp1, 1, vec_enc);
4907   // Exponent = biased_exp - 127
4908   vpsubd(dst, dst, xtmp1, vec_enc);
4909 
4910   // Exponent = Exponent  + 1
4911   vpsrld(xtmp3, xtmp1, 6, vec_enc);
4912   vpaddd(dst, dst, xtmp3, vec_enc);
4913 
4914   // Replace a -ve exponent with zero; the exponent is -ve only when the src
4915   // lane contains a zero value.
4916   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4917   vblendvps(dst, dst, xtmp2, dst, vec_enc);
4918 
4919   // Rematerialize broadcast 32.
4920   vpslld(xtmp1, xtmp3, 5, vec_enc);
4921   // Exponent is 32 if corresponding source lane contains max_int value.
4922   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4923   // LZCNT = 32 - exponent
4924   vpsubd(dst, xtmp1, dst, vec_enc);
4925 
4926   // Replace LZCNT with the value 1 if the corresponding source lane
4927   // contains the max_int value (max_int rounds up to 2^31 during the int->float conversion).
4928   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
4929 
4930   // Replace LZCNT with 0 if the source lane value is negative (its leading bit is already set).
4931   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4932   vblendvps(dst, dst, xtmp2, src, vec_enc);
4933 }
4934 
4935 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4936                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
4937   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4938   // Add zero counts of lower word and upper word of a double word if
4939   // upper word holds a zero value.
4940   vpsrld(xtmp3, src, 16, vec_enc);
4941   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
4942   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
4943   vpslld(xtmp2, dst, 16, vec_enc);
4944   vpaddd(xtmp2, xtmp2, dst, vec_enc);
4945   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
4946   vpsrld(dst, dst, 16, vec_enc);
4947   // Add zero counts of lower doubleword and upper doubleword of a
4948   // quadword if upper doubleword holds a zero value.
4949   vpsrlq(xtmp3, src, 32, vec_enc);
4950   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
4951   vpsllq(xtmp2, dst, 32, vec_enc);
4952   vpaddq(xtmp2, xtmp2, dst, vec_enc);
4953   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
4954   vpsrlq(dst, dst, 32, vec_enc);
4955 }
4956 
4957 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
4958                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
4959                                                        Register rtmp, int vec_enc) {
4960   assert(is_integral_type(bt), "unexpected type");
4961   assert(vec_enc < Assembler::AVX_512bit, "");
4962   switch(bt) {
4963     case T_LONG:
4964       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4965       break;
4966     case T_INT:
4967       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
4968       break;
4969     case T_SHORT:
4970       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4971       break;
4972     case T_BYTE:
4973       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4974       break;
4975     default:
4976       ShouldNotReachHere();
4977   }
4978 }
4979 
4980 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
4981   switch(bt) {
4982     case T_BYTE:
4983       vpsubb(dst, src1, src2, vec_enc);
4984       break;
4985     case T_SHORT:
4986       vpsubw(dst, src1, src2, vec_enc);
4987       break;
4988     case T_INT:
4989       vpsubd(dst, src1, src2, vec_enc);
4990       break;
4991     case T_LONG:
4992       vpsubq(dst, src1, src2, vec_enc);
4993       break;
4994     default:
4995       ShouldNotReachHere();
4996   }
4997 }
4998 
4999 void C2_MacroAssembler::vpadd(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5000   switch(bt) {
5001     case T_BYTE:
5002       vpaddb(dst, src1, src2, vec_enc);
5003       break;
5004     case T_SHORT:
5005       vpaddw(dst, src1, src2, vec_enc);
5006       break;
5007     case T_INT:
5008       vpaddd(dst, src1, src2, vec_enc);
5009       break;
5010     case T_LONG:
5011       vpaddq(dst, src1, src2, vec_enc);
5012       break;
5013     default:
5014       ShouldNotReachHere();
5015   }
5016 }
5017 
5018 // Trailing zero count computation is based on the leading zero count operation as per
5019 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
5020 // a direct vector instruction to compute the leading zero count.
5021 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
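     //
     // For example, for x = 12 (0b1100) as an int: (x - 1) & ~x = 0b1011 & ~0b1100 = 0b0011,
     // whose CLZ is 30, so CTZ = 32 - 30 = 2, matching the two trailing zeros of 12. For x = 0
     // the masked value is all ones, CLZ is 0 and CTZ comes out as the full width, 32.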
5022 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5023                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5024                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5025   assert(is_integral_type(bt), "");
5026   // xtmp4 = -1
5027   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5028   // xtmp4 = xtmp4 + src = src - 1
5029   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5030   // xtmp4 = xtmp4 & ~src = (src - 1) & ~src
5031   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5032   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5033   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5034   vpsub(bt, dst, xtmp4, dst, vec_enc);
5035 }
5036 
5037 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
5038 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
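     //
     // For example, for x = 12 (0b1100) as an int: x | -x = 0x0000000C | 0xFFFFFFF4 = 0xFFFFFFFC,
     // whose popcount is 30, so CTZ = 32 - 30 = 2. For x = 0, x | -x = 0 and CTZ = 32.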
5039 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5040                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5041   assert(is_integral_type(bt), "");
5042   // xtmp3 = 0
5043   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5044   // xtmp3 = 0 - src = -src
5045   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5046   // xtmp3 = xtmp3 | src = src | -src
5047   vpor(xtmp3, xtmp3, src, vec_enc);
5048   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5049   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5050   vpsub(bt, dst, xtmp1, dst, vec_enc);
5051 }
5052 
5053 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5054   Label done;
5055   Label neg_divisor_fastpath;
5056   cmpl(divisor, 0);
5057   jccb(Assembler::less, neg_divisor_fastpath);
5058   xorl(rdx, rdx);
5059   divl(divisor);
5060   jmpb(done);
5061   bind(neg_divisor_fastpath);
5062   // Fastpath for divisor < 0:
5063   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5064   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
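       // When the divisor is negative as a signed int it is >= 2^31 unsigned, so the unsigned
       // quotient can only be 0 or 1, and it is 1 exactly when dividend >= divisor (unsigned).
       // (dividend & ~(dividend - divisor)) has its sign bit set exactly in that case: the
       // dividend must have its top bit set, and then dividend - divisor is a small
       // non-negative value whose top bit is clear.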
5065   movl(rdx, rax);
5066   subl(rdx, divisor);
5067   if (VM_Version::supports_bmi1()) {
5068     andnl(rax, rdx, rax);
5069   } else {
5070     notl(rdx);
5071     andl(rax, rdx);
5072   }
5073   shrl(rax, 31);
5074   bind(done);
5075 }
5076 
5077 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5078   Label done;
5079   Label neg_divisor_fastpath;
5080   cmpl(divisor, 0);
5081   jccb(Assembler::less, neg_divisor_fastpath);
5082   xorl(rdx, rdx);
5083   divl(divisor);
5084   jmpb(done);
5085   bind(neg_divisor_fastpath);
5086   // Fastpath when divisor < 0:
5087   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5088   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
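       // As in udivI, the unsigned quotient here is 0 or 1. The arithmetic shift below turns the
       // quotient bit into 0 or all-ones, so the divisor is subtracted from the dividend exactly
       // when the quotient is 1.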
5089   movl(rdx, rax);
5090   subl(rax, divisor);
5091   if (VM_Version::supports_bmi1()) {
5092     andnl(rax, rax, rdx);
5093   } else {
5094     notl(rax);
5095     andl(rax, rdx);
5096   }
5097   sarl(rax, 31);
5098   andl(rax, divisor);
5099   subl(rdx, rax);
5100   bind(done);
5101 }
5102 
5103 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5104   Label done;
5105   Label neg_divisor_fastpath;
5106 
5107   cmpl(divisor, 0);
5108   jccb(Assembler::less, neg_divisor_fastpath);
5109   xorl(rdx, rdx);
5110   divl(divisor);
5111   jmpb(done);
5112   bind(neg_divisor_fastpath);
5113   // Fastpath for divisor < 0:
5114   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5115   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5116   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5117   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5118   movl(rdx, rax);
5119   subl(rax, divisor);
5120   if (VM_Version::supports_bmi1()) {
5121     andnl(rax, rax, rdx);
5122   } else {
5123     notl(rax);
5124     andl(rax, rdx);
5125   }
5126   movl(tmp, rax);
5127   shrl(rax, 31); // quotient
5128   sarl(tmp, 31);
5129   andl(tmp, divisor);
5130   subl(rdx, tmp); // remainder
5131   bind(done);
5132 }
5133 
5134 #ifdef _LP64
5135 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
5136   Label done;
5137   Label neg_divisor_fastpath;
5138   cmpq(divisor, 0);
5139   jccb(Assembler::less, neg_divisor_fastpath);
5140   xorl(rdx, rdx);
5141   divq(divisor);
5142   jmpb(done);
5143   bind(neg_divisor_fastpath);
5144   // Fastpath for divisor < 0:
5145   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5146   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5147   movq(rdx, rax);
5148   subq(rdx, divisor);
5149   if (VM_Version::supports_bmi1()) {
5150     andnq(rax, rdx, rax);
5151   } else {
5152     notq(rdx);
5153     andq(rax, rdx);
5154   }
5155   shrq(rax, 63);
5156   bind(done);
5157 }
5158 
5159 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
5160   Label done;
5161   Label neg_divisor_fastpath;
5162   cmpq(divisor, 0);
5163   jccb(Assembler::less, neg_divisor_fastpath);
5164   xorq(rdx, rdx);
5165   divq(divisor);
5166   jmp(done);
5167   bind(neg_divisor_fastpath);
5168   // Fastpath when divisor < 0:
5169   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5170   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5171   movq(rdx, rax);
5172   subq(rax, divisor);
5173   if (VM_Version::supports_bmi1()) {
5174     andnq(rax, rax, rdx);
5175   } else {
5176     notq(rax);
5177     andq(rax, rdx);
5178   }
5179   sarq(rax, 63);
5180   andq(rax, divisor);
5181   subq(rdx, rax);
5182   bind(done);
5183 }
5184 
5185 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
5186   Label done;
5187   Label neg_divisor_fastpath;
5188   cmpq(divisor, 0);
5189   jccb(Assembler::less, neg_divisor_fastpath);
5190   xorq(rdx, rdx);
5191   divq(divisor);
5192   jmp(done);
5193   bind(neg_divisor_fastpath);
5194   // Fastpath for divisor < 0:
5195   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5196   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5197   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5198   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5199   movq(rdx, rax);
5200   subq(rax, divisor);
5201   if (VM_Version::supports_bmi1()) {
5202     andnq(rax, rax, rdx);
5203   } else {
5204     notq(rax);
5205     andq(rax, rdx);
5206   }
5207   movq(tmp, rax);
5208   shrq(rax, 63); // quotient
5209   sarq(tmp, 63);
5210   andq(tmp, divisor);
5211   subq(rdx, tmp); // remainder
5212   bind(done);
5213 }
5214 #endif