1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/globalDefinitions.hpp"
  40 #include "utilities/powerOfTwo.hpp"
  41 #include "utilities/sizes.hpp"
  42 
  43 #ifdef PRODUCT
  44 #define BLOCK_COMMENT(str) /* nothing */
  45 #define STOP(error) stop(error)
  46 #else
  47 #define BLOCK_COMMENT(str) block_comment(str)
  48 #define STOP(error) block_comment(error); stop(error)
  49 #endif
  50 
  51 // C2 compiled method's prolog code.
  52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  53 
  54   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  55   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes;
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // a stack bang then we must use the 6 byte frame allocation even if
  59   // we have no frame. :-(
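  // A rough sketch of the two prolog shapes emitted below (illustrative only,
  // not the exact instruction sequence):
  //   with stack bang:                 without stack bang:
  //     <bang stack pages>               sub  rsp, framesize   ; forced imm32 form
  //     push rbp                         mov  [rsp + off], rbp
  //     (mov rbp, rsp)                   (rbp set from rsp + off)
  //     sub  rsp, framesize
  // In the second form the initial subtract with a 32-bit immediate satisfies the
  // 5-byte patching requirement described above.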
  60   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  61 
  62   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  63   // Remove word for return addr
  64   framesize -= wordSize;
  65   stack_bang_size -= wordSize;
  66 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  Be careful, though: some
  // VM calls (such as call site linkage) can use several kilobytes of
  // stack, but the stack safety zone should account for that.
  71   // See bugs 4446381, 4468289, 4497237.
  72   if (stack_bang_size > 0) {
  73     generate_stack_overflow_check(stack_bang_size);
  74 
  75     // We always push rbp, so that on return to interpreter rbp, will be
  76     // restored correctly and we can correct the stack.
  77     push(rbp);
  78     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  79     if (PreserveFramePointer) {
  80       mov(rbp, rsp);
  81     }
  82     // Remove word for ebp
  83     framesize -= wordSize;
  84 
  85     // Create frame
  86     if (framesize) {
  87       subptr(rsp, framesize);
  88     }
  89   } else {
  90     // Create frame (force generation of a 4 byte immediate value)
  91     subptr_imm32(rsp, framesize);
  92 
  93     // Save RBP register now.
  94     framesize -= wordSize;
  95     movptr(Address(rsp, framesize), rbp);
  96     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  97     if (PreserveFramePointer) {
  98       movptr(rbp, rsp);
  99       if (framesize > 0) {
 100         addptr(rbp, framesize);
 101       }
 102     }
 103   }
 104 
 105   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 106     framesize -= wordSize;
 107     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 108   }
 109 
 110 #ifndef _LP64
 111   // If method sets FPU control word do it now
 112   if (fp_mode_24b) {
 113     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 114   }
 115   if (UseSSE >= 2 && VerifyFPU) {
 116     verify_FPU(0, "FPU stack must be clean on entry");
 117   }
 118 #endif
 119 
 120 #ifdef ASSERT
 121   if (VerifyStackAtCalls) {
 122     Label L;
 123     push(rax);
 124     mov(rax, rsp);
 125     andptr(rax, StackAlignmentInBytes-1);
 126     cmpptr(rax, StackAlignmentInBytes-wordSize);
 127     pop(rax);
 128     jcc(Assembler::equal, L);
 129     STOP("Stack is not properly aligned!");
 130     bind(L);
 131   }
 132 #endif
 133 
 134   if (!is_stub) {
 135     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 137     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 138       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 139       Label dummy_slow_path;
 140       Label dummy_continuation;
 141       Label* slow_path = &dummy_slow_path;
 142       Label* continuation = &dummy_continuation;
 143       if (!Compile::current()->output()->in_scratch_emit_size()) {
 144         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 145         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 146         Compile::current()->output()->add_stub(stub);
 147         slow_path = &stub->entry();
 148         continuation = &stub->continuation();
 149       }
 150       bs->nmethod_entry_barrier(this, slow_path, continuation);
 151     }
 152 #else
 153     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 154     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 155 #endif
 156   }
 157 }
 158 
 159 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 160   switch (vlen_in_bytes) {
 161     case  4: // fall-through
 162     case  8: // fall-through
 163     case 16: return Assembler::AVX_128bit;
 164     case 32: return Assembler::AVX_256bit;
 165     case 64: return Assembler::AVX_512bit;
 166 
 167     default: {
 168       ShouldNotReachHere();
 169       return Assembler::AVX_NoVec;
 170     }
 171   }
 172 }
 173 
 174 #if INCLUDE_RTM_OPT
 175 
 176 // Update rtm_counters based on abort status
 177 // input: abort_status
 178 //        rtm_counters (RTMLockingCounters*)
 179 // flags are killed
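// A rough C-like sketch of the update performed below (illustrative only):
//   rtm_counters->abort_count++;                       // atomic increment
//   if (PrintPreciseRTMLockingStatistics) {
//     for (int i = 0; i < ABORT_STATUS_LIMIT; i++) {
//       if (abort_status & (1 << i)) rtm_counters->abortX_count[i]++;
//     }
//   }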
 180 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 181 
 182   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 183   if (PrintPreciseRTMLockingStatistics) {
 184     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 185       Label check_abort;
 186       testl(abort_status, (1<<i));
 187       jccb(Assembler::equal, check_abort);
 188       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 189       bind(check_abort);
 190     }
 191   }
 192 }
 193 
// Branch if ((random & (count-1)) != 0), where count is 2^n
 195 // tmp, scr and flags are killed
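// Roughly (illustrative only): the low TSC bits serve as a cheap pseudo-random
// source, i.e.
//   if ((rdtsc() & (count - 1)) != 0) goto brLabel;
// so only about 1 in 'count' executions fall through to the code that follows.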
 196 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 197   assert(tmp == rax, "");
 198   assert(scr == rdx, "");
 199   rdtsc(); // modifies EDX:EAX
 200   andptr(tmp, count-1);
 201   jccb(Assembler::notZero, brLabel);
 202 }
 203 
 204 // Perform abort ratio calculation, set no_rtm bit if high ratio
 205 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 206 // tmpReg, rtm_counters_Reg and flags are killed
 207 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 208                                                     Register rtm_counters_Reg,
 209                                                     RTMLockingCounters* rtm_counters,
 210                                                     Metadata* method_data) {
 211   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 212 
 213   if (RTMLockingCalculationDelay > 0) {
 214     // Delay calculation
 215     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 216     testptr(tmpReg, tmpReg);
 217     jccb(Assembler::equal, L_done);
 218   }
 219   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 220   //   Aborted transactions = abort_count * 100
 221   //   All transactions = total_count *  RTMTotalCountIncrRate
 222   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
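  // Worked example (hypothetical flag values, purely for illustration):
  //   with RTMAbortRatio = 50 and RTMTotalCountIncrRate = 64 the test below is
  //     abort_count * 100 >= total_count * 64 * 50
  //   i.e. no_rtm is set once aborts amount to at least half of the sampled
  //   transactions; otherwise, if total_count has reached
  //   RTMLockingThreshold / RTMTotalCountIncrRate, the "always rtm" state is set.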
 223 
 224   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 225   cmpptr(tmpReg, RTMAbortThreshold);
 226   jccb(Assembler::below, L_check_always_rtm2);
 227   imulptr(tmpReg, tmpReg, 100);
 228 
 229   Register scrReg = rtm_counters_Reg;
 230   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 231   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 232   imulptr(scrReg, scrReg, RTMAbortRatio);
 233   cmpptr(tmpReg, scrReg);
 234   jccb(Assembler::below, L_check_always_rtm1);
 235   if (method_data != nullptr) {
 236     // set rtm_state to "no rtm" in MDO
 237     mov_metadata(tmpReg, method_data);
 238     lock();
 239     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 240   }
 241   jmpb(L_done);
 242   bind(L_check_always_rtm1);
 243   // Reload RTMLockingCounters* address
 244   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 245   bind(L_check_always_rtm2);
 246   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 247   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 248   jccb(Assembler::below, L_done);
 249   if (method_data != nullptr) {
 250     // set rtm_state to "always rtm" in MDO
 251     mov_metadata(tmpReg, method_data);
 252     lock();
 253     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 254   }
 255   bind(L_done);
 256 }
 257 
 258 // Update counters and perform abort ratio calculation
 259 // input:  abort_status_Reg
 260 // rtm_counters_Reg, flags are killed
 261 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 262                                       Register rtm_counters_Reg,
 263                                       RTMLockingCounters* rtm_counters,
 264                                       Metadata* method_data,
 265                                       bool profile_rtm) {
 266 
 267   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 268   // update rtm counters based on rax value at abort
 269   // reads abort_status_Reg, updates flags
 270   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 271   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 272   if (profile_rtm) {
 273     // Save abort status because abort_status_Reg is used by following code.
 274     if (RTMRetryCount > 0) {
 275       push(abort_status_Reg);
 276     }
 277     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 278     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 279     // restore abort status
 280     if (RTMRetryCount > 0) {
 281       pop(abort_status_Reg);
 282     }
 283   }
 284 }
 285 
 286 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 287 // inputs: retry_count_Reg
 288 //       : abort_status_Reg
 289 // output: retry_count_Reg decremented by 1
 290 // flags are killed
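// Roughly (illustrative only):
//   if ((abort_status & 0x6) != 0 && retry_count != 0) {
//     pause(); retry_count--; goto retryLabel;
//   }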
 291 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 292   Label doneRetry;
 293   assert(abort_status_Reg == rax, "");
 294   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 295   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 296   // if reason is in 0x6 and retry count != 0 then retry
 297   andptr(abort_status_Reg, 0x6);
 298   jccb(Assembler::zero, doneRetry);
 299   testl(retry_count_Reg, retry_count_Reg);
 300   jccb(Assembler::zero, doneRetry);
 301   pause();
 302   decrementl(retry_count_Reg);
 303   jmp(retryLabel);
 304   bind(doneRetry);
 305 }
 306 
 307 // Spin and retry if lock is busy,
 308 // inputs: box_Reg (monitor address)
 309 //       : retry_count_Reg
 310 // output: retry_count_Reg decremented by 1
 311 //       : clear z flag if retry count exceeded
 312 // tmp_Reg, scr_Reg, flags are killed
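// Roughly (illustrative only):
//   if (retry_count != 0) {
//     retry_count--;
//     for (int spin = RTMSpinLoopCount; spin > 0; spin--) {
//       pause();
//       if (monitor->owner == nullptr) break;
//     }
//     goto retryLabel;
//   }
//   // otherwise fall through with ZF cleared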
 313 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 314                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 315   Label SpinLoop, SpinExit, doneRetry;
 316   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 317 
 318   testl(retry_count_Reg, retry_count_Reg);
 319   jccb(Assembler::zero, doneRetry);
 320   decrementl(retry_count_Reg);
 321   movptr(scr_Reg, RTMSpinLoopCount);
 322 
 323   bind(SpinLoop);
 324   pause();
 325   decrementl(scr_Reg);
 326   jccb(Assembler::lessEqual, SpinExit);
 327   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 328   testptr(tmp_Reg, tmp_Reg);
 329   jccb(Assembler::notZero, SpinLoop);
 330 
 331   bind(SpinExit);
 332   jmp(retryLabel);
 333   bind(doneRetry);
 334   incrementl(retry_count_Reg); // clear z flag
 335 }
 336 
 337 // Use RTM for normal stack locks
 338 // Input: objReg (object to lock)
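// Very roughly (illustrative only; profiling and retry bookkeeping omitted):
//   retry:
//     if (obj->mark() & monitor_value) goto IsInflated;        // not a stack lock
//     xbegin();
//     if ((obj->mark() & lock_mask) == unlocked_value) goto DONE_LABEL;  // lock elided
//     xabort() or xend();                                      // lock is busy
//     retry a bounded number of times (RTMRetryCount), else fall through so the
//     caller emits the regular stack-locking path.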
 339 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 340                                          Register retry_on_abort_count_Reg,
 341                                          RTMLockingCounters* stack_rtm_counters,
 342                                          Metadata* method_data, bool profile_rtm,
 343                                          Label& DONE_LABEL, Label& IsInflated) {
 344   assert(UseRTMForStackLocks, "why call this otherwise?");
 345   assert(tmpReg == rax, "");
 346   assert(scrReg == rdx, "");
 347   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 348 
 349   if (RTMRetryCount > 0) {
 350     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 351     bind(L_rtm_retry);
 352   }
 353   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 354   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 355   jcc(Assembler::notZero, IsInflated);
 356 
 357   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 358     Label L_noincrement;
 359     if (RTMTotalCountIncrRate > 1) {
 360       // tmpReg, scrReg and flags are killed
 361       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 362     }
 363     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 364     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 365     bind(L_noincrement);
 366   }
 367   xbegin(L_on_abort);
 368   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 369   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 370   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 371   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 372 
 373   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 374   if (UseRTMXendForLockBusy) {
 375     xend();
 376     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 377     jmp(L_decrement_retry);
 378   }
 379   else {
 380     xabort(0);
 381   }
 382   bind(L_on_abort);
 383   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 384     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 385   }
 386   bind(L_decrement_retry);
 387   if (RTMRetryCount > 0) {
 388     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 389     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 390   }
 391 }
 392 
 393 // Use RTM for inflating locks
 394 // inputs: objReg (object to lock)
 395 //         boxReg (on-stack box address (displaced header location) - KILLED)
 396 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 397 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 398                                             Register scrReg, Register retry_on_busy_count_Reg,
 399                                             Register retry_on_abort_count_Reg,
 400                                             RTMLockingCounters* rtm_counters,
 401                                             Metadata* method_data, bool profile_rtm,
 402                                             Label& DONE_LABEL) {
 403   assert(UseRTMLocking, "why call this otherwise?");
 404   assert(tmpReg == rax, "");
 405   assert(scrReg == rdx, "");
 406   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 407   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 408 
 409   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 410   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 411 
 412   if (RTMRetryCount > 0) {
 413     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 414     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 415     bind(L_rtm_retry);
 416   }
 417   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 418     Label L_noincrement;
 419     if (RTMTotalCountIncrRate > 1) {
 420       // tmpReg, scrReg and flags are killed
 421       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 422     }
 423     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 424     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 425     bind(L_noincrement);
 426   }
 427   xbegin(L_on_abort);
 428   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 429   movptr(tmpReg, Address(tmpReg, owner_offset));
 430   testptr(tmpReg, tmpReg);
 431   jcc(Assembler::zero, DONE_LABEL);
 432   if (UseRTMXendForLockBusy) {
 433     xend();
 434     jmp(L_decrement_retry);
 435   }
 436   else {
 437     xabort(0);
 438   }
 439   bind(L_on_abort);
 440   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 441   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 442     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 443   }
 444   if (RTMRetryCount > 0) {
 445     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 446     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 447   }
 448 
 449   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 450   testptr(tmpReg, tmpReg) ;
 451   jccb(Assembler::notZero, L_decrement_retry) ;
 452 
 453   // Appears unlocked - try to swing _owner from null to non-null.
 454   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 455 #ifdef _LP64
 456   Register threadReg = r15_thread;
 457 #else
 458   get_thread(scrReg);
 459   Register threadReg = scrReg;
 460 #endif
 461   lock();
 462   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 463 
 464   if (RTMRetryCount > 0) {
 465     // success done else retry
 466     jccb(Assembler::equal, DONE_LABEL) ;
 467     bind(L_decrement_retry);
 468     // Spin and retry if lock is busy.
 469     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 470   }
 471   else {
 472     bind(L_decrement_retry);
 473   }
 474 }
 475 
 476 #endif //  INCLUDE_RTM_OPT
 477 
 478 // fast_lock and fast_unlock used by C2
 479 
 480 // Because the transitions from emitted code to the runtime
 481 // monitorenter/exit helper stubs are so slow it's critical that
 482 // we inline both the stack-locking fast path and the inflated fast path.
 483 //
 484 // See also: cmpFastLock and cmpFastUnlock.
 485 //
 486 // What follows is a specialized inline transliteration of the code
 487 // in enter() and exit(). If we're concerned about I$ bloat another
 488 // option would be to emit TrySlowEnter and TrySlowExit methods
 489 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 491 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 492 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 493 // In practice, however, the # of lock sites is bounded and is usually small.
 494 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 498 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 500 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 501 // to those specialized methods.  That'd give us a mostly platform-independent
 502 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 507 //
 508 // TODO:
 509 //
 510 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 511 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 512 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 513 //    the lock operators would typically be faster than reifying Self.
 514 //
 515 // *  Ideally I'd define the primitives as:
 516 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 517 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 518 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 520 //    Furthermore the register assignments are overconstrained, possibly resulting in
 521 //    sub-optimal code near the synchronization site.
 522 //
 523 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 524 //    Alternately, use a better sp-proximity test.
 525 //
 526 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 527 //    Either one is sufficient to uniquely identify a thread.
 528 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 529 //
 530 // *  Intrinsify notify() and notifyAll() for the common cases where the
 531 //    object is locked by the calling thread but the waitlist is empty.
//    This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 533 //
 534 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 535 //    But beware of excessive branch density on AMD Opterons.
 536 //
 537 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 538 //    or failure of the fast path.  If the fast path fails then we pass
 539 //    control to the slow path, typically in C.  In fast_lock and
 540 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 541 //    will emit a conditional branch immediately after the node.
 542 //    So we have branches to branches and lots of ICC.ZF games.
 543 //    Instead, it might be better to have C2 pass a "FailureLabel"
 544 //    into fast_lock and fast_unlock.  In the case of success, control
 545 //    will drop through the node.  ICC.ZF is undefined at exit.
 546 //    In the case of failure, the node will branch directly to the
 547 //    FailureLabel
 548 
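// For orientation, a rough sketch of the LM_LEGACY protocol that fast_lock emits
// (illustrative only; RTM, diagnostics and counting are omitted):
//   mark = obj->mark();
//   if (mark & monitor_value) goto inflated;                // already an ObjectMonitor
//   box->displaced_header = mark | unlocked_value;          // anticipate success
//   if (CAS(&obj->mark(), mark | unlocked_value, box)) ZF = 1;        // stack-locked
//   else if (mark is our own SP, within one page of rsp) {            // recursive
//     box->displaced_header = 0; ZF = 1;
//   } else ZF = 0;                                           // contended -> slow path
//   inflated: CAS monitor->owner from null to the current thread, or bump
//             recursions if we already own it; ZF reflects success.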
 549 
 550 // obj: object to lock
 551 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 553 // scr: tmp -- KILLED
 554 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 555                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 556                                  RTMLockingCounters* rtm_counters,
 557                                  RTMLockingCounters* stack_rtm_counters,
 558                                  Metadata* method_data,
 559                                  bool use_rtm, bool profile_rtm) {
 560   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 561   // Ensure the register assignments are disjoint
 562   assert(tmpReg == rax, "");
 563 
 564   if (use_rtm) {
 565     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 566   } else {
 567     assert(cx1Reg == noreg, "");
 568     assert(cx2Reg == noreg, "");
 569     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 570   }
 571 
 572   // Possible cases that we'll encounter in fast_lock
 573   // ------------------------------------------------
 574   // * Inflated
 575   //    -- unlocked
 576   //    -- Locked
 577   //       = by self
 578   //       = by other
 579   // * neutral
 580   // * stack-locked
 581   //    -- by self
 582   //       = sp-proximity test hits
 583   //       = sp-proximity test generates false-negative
 584   //    -- by other
 585   //
 586 
 587   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 588 
 589   if (DiagnoseSyncOnValueBasedClasses != 0) {
 590     load_klass(tmpReg, objReg, scrReg);
 591     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 592     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 593     jcc(Assembler::notZero, DONE_LABEL);
 594   }
 595 
 596 #if INCLUDE_RTM_OPT
 597   if (UseRTMForStackLocks && use_rtm) {
 598     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 599     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 600                       stack_rtm_counters, method_data, profile_rtm,
 601                       DONE_LABEL, IsInflated);
 602   }
 603 #endif // INCLUDE_RTM_OPT
 604 
 605   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 606   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 607   jcc(Assembler::notZero, IsInflated);
 608 
 609   if (LockingMode == LM_MONITOR) {
 610     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 611     testptr(objReg, objReg);
 612   } else {
 613     assert(LockingMode == LM_LEGACY, "must be");
 614     // Attempt stack-locking ...
 615     orptr (tmpReg, markWord::unlocked_value);
 616     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 617     lock();
 618     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 619     jcc(Assembler::equal, COUNT);           // Success
 620 
 621     // Recursive locking.
 622     // The object is stack-locked: markword contains stack pointer to BasicLock.
 623     // Locked by current thread if difference with current SP is less than one page.
 624     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 626     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 627     movptr(Address(boxReg, 0), tmpReg);
 628   }
 629   jmp(DONE_LABEL);
 630 
 631   bind(IsInflated);
 632   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 633 
 634 #if INCLUDE_RTM_OPT
 635   // Use the same RTM locking code in 32- and 64-bit VM.
 636   if (use_rtm) {
 637     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 638                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 639   } else {
 640 #endif // INCLUDE_RTM_OPT
 641 
 642 #ifndef _LP64
 643   // The object is inflated.
 644 
 645   // boxReg refers to the on-stack BasicLock in the current frame.
 646   // We'd like to write:
 647   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 649   // additional latency as we have another ST in the store buffer that must drain.
 650 
 651   // avoid ST-before-CAS
 652   // register juggle because we need tmpReg for cmpxchgptr below
 653   movptr(scrReg, boxReg);
 654   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 655 
 656   // Optimistic form: consider XORL tmpReg,tmpReg
 657   movptr(tmpReg, NULL_WORD);
 658 
 659   // Appears unlocked - try to swing _owner from null to non-null.
 660   // Ideally, I'd manifest "Self" with get_thread and then attempt
 661   // to CAS the register containing Self into m->Owner.
 662   // But we don't have enough registers, so instead we can either try to CAS
 663   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 664   // we later store "Self" into m->Owner.  Transiently storing a stack address
 665   // (rsp or the address of the box) into  m->owner is harmless.
 666   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 667   lock();
 668   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 669   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 670   // If we weren't able to swing _owner from null to the BasicLock
 671   // then take the slow path.
 672   jccb  (Assembler::notZero, NO_COUNT);
 673   // update _owner from BasicLock to thread
 674   get_thread (scrReg);                    // beware: clobbers ICCs
 675   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 676   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 677 
 678   // If the CAS fails we can either retry or pass control to the slow path.
 679   // We use the latter tactic.
 680   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 681   // If the CAS was successful ...
 682   //   Self has acquired the lock
 683   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 684   // Intentional fall-through into DONE_LABEL ...
 685 #else // _LP64
 686   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 687   movq(scrReg, tmpReg);
 688   xorq(tmpReg, tmpReg);
 689   lock();
 690   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 691   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 692   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 693   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 694   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 695   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 696 
 697   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 698   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 699   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 700   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 701 #endif // _LP64
 702 #if INCLUDE_RTM_OPT
 703   } // use_rtm()
 704 #endif
 705   bind(DONE_LABEL);
 706 
 707   // ZFlag == 1 count in fast path
 708   // ZFlag == 0 count in slow path
 709   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 710 
 711   bind(COUNT);
 712   // Count monitors in fast path
 713   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 714 
 715   xorl(tmpReg, tmpReg); // Set ZF == 1
 716 
 717   bind(NO_COUNT);
 718 
 719   // At NO_COUNT the icc ZFlag is set as follows ...
 720   // fast_unlock uses the same protocol.
 721   // ZFlag == 1 -> Success
 722   // ZFlag == 0 -> Failure - force control through the slow path
 723 }
 724 
 725 // obj: object to unlock
 726 // box: box address (displaced header location), killed.  Must be EAX.
 727 // tmp: killed, cannot be obj nor box.
 728 //
 729 // Some commentary on balanced locking:
 730 //
 731 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 732 // Methods that don't have provably balanced locking are forced to run in the
 733 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 734 // The interpreter provides two properties:
 735 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 737 //      interpreter maintains an on-stack list of locks currently held by
 738 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 741 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 743 // B() doesn't have provably balanced locking so it runs in the interpreter.
 744 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 745 // is still locked by A().
 746 //
 747 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 748 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 749 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 750 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
 753 // In the interest of performance we elide m->Owner==Self check in unlock.
 754 // A perfectly viable alternative is to elide the owner check except when
 755 // Xcheck:jni is enabled.
 756 
 757 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 758   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 759   assert(boxReg == rax, "");
 760   assert_different_registers(objReg, boxReg, tmpReg);
 761 
 762   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 763 
 764 #if INCLUDE_RTM_OPT
 765   if (UseRTMForStackLocks && use_rtm) {
 766     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 767     Label L_regular_unlock;
 768     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 769     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 770     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 771     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 772     xend();                                                           // otherwise end...
 773     jmp(DONE_LABEL);                                                  // ... and we're done
 774     bind(L_regular_unlock);
 775   }
 776 #endif
 777 
 778   if (LockingMode == LM_LEGACY) {
 779     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 780     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 781   }
 782   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 783   if (LockingMode != LM_MONITOR) {
 784     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 785     jcc(Assembler::zero, Stacked);
 786   }
 787 
 788   // It's inflated.
 789 
 790 #if INCLUDE_RTM_OPT
 791   if (use_rtm) {
 792     Label L_regular_inflated_unlock;
 793     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 794     movptr(boxReg, Address(tmpReg, owner_offset));
 795     testptr(boxReg, boxReg);
 796     jccb(Assembler::notZero, L_regular_inflated_unlock);
 797     xend();
 798     jmp(DONE_LABEL);
 799     bind(L_regular_inflated_unlock);
 800   }
 801 #endif
 802 
 803   // Despite our balanced locking property we still check that m->_owner == Self
 804   // as java routines or native JNI code called by this thread might
 805   // have released the lock.
 806   // Refer to the comments in synchronizer.cpp for how we might encode extra
 807   // state in _succ so we can avoid fetching EntryList|cxq.
 808   //
 809   // If there's no contention try a 1-0 exit.  That is, exit without
 810   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 811   // we detect and recover from the race that the 1-0 exit admits.
 812   //
 813   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 814   // before it STs null into _owner, releasing the lock.  Updates
 815   // to data protected by the critical section must be visible before
 816   // we drop the lock (and thus before any other thread could acquire
 817   // the lock and observe the fields protected by the lock).
 818   // IA32's memory-model is SPO, so STs are ordered with respect to
 819   // each other and there's no need for an explicit barrier (fence).
 820   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
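  // A rough sketch of the LP64 inflated exit emitted below (illustrative only;
  // the 32-bit path is a simplified variant):
  //   if (m->recursions != 0) { m->recursions--; ZF = 1; return; }
  //   if ((m->cxq | m->EntryList) == 0) { m->owner = null; ZF = 1; return; }
  //   if (m->succ == null) goto slow_path;            // no apparent successor
  //   m->owner = null;  full_fence();
  //   if (m->succ != null) { ZF = 1; return; }        // successor will take over
  //   if (CAS(&m->owner, null, self) fails) { ZF = 1; return; }  // someone else got it
  //   goto slow_path;                                 // we re-acquired; defer to runtime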
 821 #ifndef _LP64
 822   // Note that we could employ various encoding schemes to reduce
 823   // the number of loads below (currently 4) to just 2 or 3.
 824   // Refer to the comments in synchronizer.cpp.
 825   // In practice the chain of fetches doesn't seem to impact performance, however.
 826   xorptr(boxReg, boxReg);
 827   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 828   jccb  (Assembler::notZero, DONE_LABEL);
 829   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 830   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 831   jccb  (Assembler::notZero, DONE_LABEL);
 832   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 833   jmpb  (DONE_LABEL);
 834 #else // _LP64
 835   // It's inflated
 836   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 837 
 838   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 839   jccb(Assembler::equal, LNotRecursive);
 840 
 841   // Recursive inflated unlock
 842   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 843   jmpb(LSuccess);
 844 
 845   bind(LNotRecursive);
 846   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 847   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 848   jccb  (Assembler::notZero, CheckSucc);
 849   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 850   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 851   jmpb  (DONE_LABEL);
 852 
 853   // Try to avoid passing control into the slow_path ...
 854   bind  (CheckSucc);
 855 
 856   // The following optional optimization can be elided if necessary
 857   // Effectively: if (succ == null) goto slow path
 858   // The code reduces the window for a race, however,
 859   // and thus benefits performance.
 860   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 861   jccb  (Assembler::zero, LGoSlowPath);
 862 
 863   xorptr(boxReg, boxReg);
 864   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 865   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 866 
 867   // Memory barrier/fence
 868   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 869   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 870   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 871   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 872   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 873   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 874   lock(); addl(Address(rsp, 0), 0);
 875 
 876   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 877   jccb  (Assembler::notZero, LSuccess);
 878 
 879   // Rare inopportune interleaving - race.
 880   // The successor vanished in the small window above.
 881   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 882   // We need to ensure progress and succession.
 883   // Try to reacquire the lock.
 884   // If that fails then the new owner is responsible for succession and this
 885   // thread needs to take no further action and can exit via the fast path (success).
 886   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 890 
 891   // box is really RAX -- the following CMPXCHG depends on that binding
 892   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 893   lock();
 894   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor, so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock, so we're done (and the exit was a success).
 898   jccb  (Assembler::notEqual, LSuccess);
 899   // Intentional fall-through into slow path
 900 
 901   bind  (LGoSlowPath);
 902   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 903   jmpb  (DONE_LABEL);
 904 
 905   bind  (LSuccess);
 906   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 907   jmpb  (DONE_LABEL);
 908 
 909 #endif
 910   if (LockingMode == LM_LEGACY) {
 911     bind  (Stacked);
 912     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 913     lock();
 914     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 915     // Intentional fall-thru into DONE_LABEL
 916   }
 917 
 918   bind(DONE_LABEL);
 919 
 920   // ZFlag == 1 count in fast path
 921   // ZFlag == 0 count in slow path
 922   jccb(Assembler::notZero, NO_COUNT);
 923 
 924   bind(COUNT);
 925   // Count monitors in fast path
 926 #ifndef _LP64
 927   get_thread(tmpReg);
 928   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 929 #else // _LP64
 930   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 931 #endif
 932 
 933   xorl(tmpReg, tmpReg); // Set ZF == 1
 934 
 935   bind(NO_COUNT);
 936 }
 937 
 938 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 939                                               Register t, Register thread) {
 940   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 941   assert(rax_reg == rax, "Used for CAS");
 942   assert_different_registers(obj, box, rax_reg, t, thread);
 943 
 944   // Handle inflated monitor.
 945   Label inflated;
 946   // Finish fast lock successfully. ZF value is irrelevant.
 947   Label locked;
 948   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 949   Label slow_path;
 950 
 951   if (DiagnoseSyncOnValueBasedClasses != 0) {
 952     load_klass(rax_reg, obj, t);
 953     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 954     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 955     jcc(Assembler::notZero, slow_path);
 956   }
 957 
 958   const Register mark = t;
 959 
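  // A rough sketch of the fast path emitted below (illustrative only):
  //   mark = obj->mark();  top = thread->lock_stack_top();
  //   if (mark & monitor_value) goto inflated;
  //   if (lock-stack is full) goto slow_path;
  //   if (lock_stack[top - 1] == obj) goto push;                      // recursive
  //   if (!CAS(&obj->mark(), mark | unlocked, mark & ~unlocked)) goto slow_path;
  //   push: lock_stack[top] = obj; top += oopSize;
  //   inflated: CAS monitor->owner from null to thread, or bump recursions if owned.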
 960   { // Lightweight Lock
 961 
 962     Label push;
 963 
 964     const Register top = box;
 965 
 966     // Load the mark.
 967     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 968 
 969     // Prefetch top.
 970     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 971 
 972     // Check for monitor (0b10).
 973     testptr(mark, markWord::monitor_value);
 974     jcc(Assembler::notZero, inflated);
 975 
 976     // Check if lock-stack is full.
 977     cmpl(top, LockStack::end_offset() - 1);
 978     jcc(Assembler::greater, slow_path);
 979 
 980     // Check if recursive.
 981     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 982     jccb(Assembler::equal, push);
 983 
 984     // Try to lock. Transition lock bits 0b01 => 0b00
 985     movptr(rax_reg, mark);
 986     orptr(rax_reg, markWord::unlocked_value);
 987     andptr(mark, ~(int32_t)markWord::unlocked_value);
 988     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 989     jcc(Assembler::notEqual, slow_path);
 990 
 991     bind(push);
 992     // After successful lock, push object on lock-stack.
 993     movptr(Address(thread, top), obj);
 994     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 995     jmpb(locked);
 996   }
 997 
 998   { // Handle inflated monitor.
 999     bind(inflated);
1000 
1001     const Register tagged_monitor = mark;
1002 
1003     // CAS owner (null => current thread).
1004     xorptr(rax_reg, rax_reg);
1005     lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1006     jccb(Assembler::equal, locked);
1007 
1008     // Check if recursive.
1009     cmpptr(thread, rax_reg);
1010     jccb(Assembler::notEqual, slow_path);
1011 
1012     // Recursive.
1013     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1014   }
1015 
1016   bind(locked);
1017   increment(Address(thread, JavaThread::held_monitor_count_offset()));
1018   // Set ZF = 1
1019   xorl(rax_reg, rax_reg);
1020 
1021 #ifdef ASSERT
1022   // Check that locked label is reached with ZF set.
1023   Label zf_correct;
1024   jccb(Assembler::zero, zf_correct);
1025   stop("Fast Lock ZF != 1");
1026 #endif
1027 
1028   bind(slow_path);
1029 #ifdef ASSERT
1030   // Check that slow_path label is reached with ZF not set.
1031   jccb(Assembler::notZero, zf_correct);
1032   stop("Fast Lock ZF != 0");
1033   bind(zf_correct);
1034 #endif
1035   // C2 uses the value of ZF to determine the continuation.
1036 }
1037 
1038 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
1039   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1040   assert(reg_rax == rax, "Used for CAS");
1041   assert_different_registers(obj, reg_rax, t);
1042 
1043   // Handle inflated monitor.
1044   Label inflated, inflated_check_lock_stack;
1045   // Finish fast unlock successfully.  MUST jump with ZF == 1
1046   Label unlocked;
1047 
1048   // Assume success.
1049   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
1050 
1051   const Register mark = t;
1052   const Register top = reg_rax;
1053 
1054   Label dummy;
1055   C2FastUnlockLightweightStub* stub = nullptr;
1056 
1057   if (!Compile::current()->output()->in_scratch_emit_size()) {
1058     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
1059     Compile::current()->output()->add_stub(stub);
1060   }
1061 
1062   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1063   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1064 
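  // A rough sketch of the fast path emitted below (illustrative only):
  //   top = thread->lock_stack_top();  mark = obj->mark();
  //   if (lock_stack[top - 1] != obj) goto inflated;     // not ours -> must be a monitor
  //   pop the lock-stack;
  //   if (lock_stack[top - 2] == obj) goto unlocked;     // recursive, object stays locked
  //   if (!CAS(&obj->mark(), mark, mark | unlocked)) goto push_and_slow_path;
  //   unlocked; the inflated case clears monitor->owner (or decrements recursions).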
1065   { // Lightweight Unlock
1066 
1067     // Load top.
1068     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1069 
1070     // Prefetch mark.
1071     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1072 
1073     // Check if obj is top of lock-stack.
1074     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1075     // Top of lock stack was not obj. Must be monitor.
1076     jcc(Assembler::notEqual, inflated_check_lock_stack);
1077 
1078     // Pop lock-stack.
1079     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1080     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1081 
1082     // Check if recursive.
1083     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1084     jcc(Assembler::equal, unlocked);
1085 
1086     // We elide the monitor check, let the CAS fail instead.
1087 
1088     // Try to unlock. Transition lock bits 0b00 => 0b01
1089     movptr(reg_rax, mark);
1090     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1091     orptr(mark, markWord::unlocked_value);
1092     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1093     jcc(Assembler::notEqual, push_and_slow_path);
1094     jmp(unlocked);
1095   }
1096 
1097 
1098   { // Handle inflated monitor.
1099     bind(inflated_check_lock_stack);
1100 #ifdef ASSERT
1101     Label check_done;
1102     subl(top, oopSize);
1103     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1104     jcc(Assembler::below, check_done);
1105     cmpptr(obj, Address(thread, top));
1106     jccb(Assembler::notEqual, inflated_check_lock_stack);
1107     stop("Fast Unlock lock on stack");
1108     bind(check_done);
1109     testptr(mark, markWord::monitor_value);
1110     jccb(Assembler::notZero, inflated);
1111     stop("Fast Unlock not monitor");
1112 #endif
1113 
1114     bind(inflated);
1115 
1116     // mark contains the tagged ObjectMonitor*.
1117     const Register monitor = mark;
1118 
1119 #ifndef _LP64
1120     // Check if recursive.
1121     xorptr(reg_rax, reg_rax);
1122     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1123     jcc(Assembler::notZero, check_successor);
1124 
1125     // Check if the entry lists are empty.
1126     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1127     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1128     jcc(Assembler::notZero, check_successor);
1129 
1130     // Release lock.
1131     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1132 #else // _LP64
1133     Label recursive;
1134 
1135     // Check if recursive.
1136     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1137     jccb(Assembler::notEqual, recursive);
1138 
1139     // Check if the entry lists are empty.
1140     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1141     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1142     jcc(Assembler::notZero, check_successor);
1143 
1144     // Release lock.
1145     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1146     jmpb(unlocked);
1147 
1148     // Recursive unlock.
1149     bind(recursive);
1150     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1151     xorl(t, t);
1152 #endif
1153   }
1154 
1155   bind(unlocked);
1156   if (stub != nullptr) {
1157     bind(stub->unlocked_continuation());
1158   }
1159 
1160 #ifdef ASSERT
1161   // Check that unlocked label is reached with ZF set.
1162   Label zf_correct;
1163   jccb(Assembler::zero, zf_correct);
1164   stop("Fast Unlock ZF != 1");
1165 #endif
1166 
1167   if (stub != nullptr) {
1168     bind(stub->slow_path_continuation());
1169   }
1170 #ifdef ASSERT
1171   // Check that stub->continuation() label is reached with ZF not set.
1172   jccb(Assembler::notZero, zf_correct);
1173   stop("Fast Unlock ZF != 0");
1174   bind(zf_correct);
1175 #endif
1176   // C2 uses the value of ZF to determine the continuation.
1177 }
1178 
1179 //-------------------------------------------------------------------------------------------
1180 // Generic instructions support for use in .ad files C2 code generation
1181 
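// The float/double Abs/Neg helpers below use the usual IEEE-754 sign-bit tricks
// (a sketch of the intent, not the emitted code):
//   abs(x) :  x AND sign_mask   // clear the sign bit
//   neg(x) :  x XOR sign_flip   // toggle the sign bit
// applied lane-wise with andps/andpd and xorps/xorpd against broadcast constants.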
1182 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1183   if (dst != src) {
1184     movdqu(dst, src);
1185   }
1186   if (opcode == Op_AbsVD) {
1187     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1188   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1190     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1191   }
1192 }
1193 
1194 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1195   if (opcode == Op_AbsVD) {
1196     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1197   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1199     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1200   }
1201 }
1202 
1203 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1204   if (dst != src) {
1205     movdqu(dst, src);
1206   }
1207   if (opcode == Op_AbsVF) {
1208     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1209   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1211     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1212   }
1213 }
1214 
1215 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1216   if (opcode == Op_AbsVF) {
1217     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1218   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1220     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1221   }
1222 }
1223 
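// For T_LONG there is no packed signed min/max before AVX-512 (vpminsq/vpmaxsq),
// so it is synthesized with a compare and a blend. Sketch (illustrative only):
//   mask = (a > b);              // pcmpgtq / vpcmpgtq: all-ones per lane if true
//   min  = blend(a, b, mask);    // take b where a > b
//   max  = blend(b, a, mask);    // take a where a > b
// In the non-AVX pminmax below the mask must sit in xmm0, since the legacy
// blendvpd instruction implicitly uses xmm0 as its mask operand.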
1224 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1225   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1226   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1227 
1228   if (opcode == Op_MinV) {
1229     if (elem_bt == T_BYTE) {
1230       pminsb(dst, src);
1231     } else if (elem_bt == T_SHORT) {
1232       pminsw(dst, src);
1233     } else if (elem_bt == T_INT) {
1234       pminsd(dst, src);
1235     } else {
1236       assert(elem_bt == T_LONG, "required");
1237       assert(tmp == xmm0, "required");
1238       assert_different_registers(dst, src, tmp);
1239       movdqu(xmm0, dst);
1240       pcmpgtq(xmm0, src);
1241       blendvpd(dst, src);  // xmm0 as mask
1242     }
1243   } else { // opcode == Op_MaxV
1244     if (elem_bt == T_BYTE) {
1245       pmaxsb(dst, src);
1246     } else if (elem_bt == T_SHORT) {
1247       pmaxsw(dst, src);
1248     } else if (elem_bt == T_INT) {
1249       pmaxsd(dst, src);
1250     } else {
1251       assert(elem_bt == T_LONG, "required");
1252       assert(tmp == xmm0, "required");
1253       assert_different_registers(dst, src, tmp);
1254       movdqu(xmm0, src);
1255       pcmpgtq(xmm0, dst);
1256       blendvpd(dst, src);  // xmm0 as mask
1257     }
1258   }
1259 }
1260 
1261 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1262                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1263                                  int vlen_enc) {
1264   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1265 
1266   if (opcode == Op_MinV) {
1267     if (elem_bt == T_BYTE) {
1268       vpminsb(dst, src1, src2, vlen_enc);
1269     } else if (elem_bt == T_SHORT) {
1270       vpminsw(dst, src1, src2, vlen_enc);
1271     } else if (elem_bt == T_INT) {
1272       vpminsd(dst, src1, src2, vlen_enc);
1273     } else {
1274       assert(elem_bt == T_LONG, "required");
1275       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1276         vpminsq(dst, src1, src2, vlen_enc);
1277       } else {
1278         assert_different_registers(dst, src1, src2);
1279         vpcmpgtq(dst, src1, src2, vlen_enc);
1280         vblendvpd(dst, src1, src2, dst, vlen_enc);
1281       }
1282     }
1283   } else { // opcode == Op_MaxV
1284     if (elem_bt == T_BYTE) {
1285       vpmaxsb(dst, src1, src2, vlen_enc);
1286     } else if (elem_bt == T_SHORT) {
1287       vpmaxsw(dst, src1, src2, vlen_enc);
1288     } else if (elem_bt == T_INT) {
1289       vpmaxsd(dst, src1, src2, vlen_enc);
1290     } else {
1291       assert(elem_bt == T_LONG, "required");
1292       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1293         vpmaxsq(dst, src1, src2, vlen_enc);
1294       } else {
1295         assert_different_registers(dst, src1, src2);
1296         vpcmpgtq(dst, src1, src2, vlen_enc);
1297         vblendvpd(dst, src2, src1, dst, vlen_enc);
1298       }
1299     }
1300   }
1301 }
1302 
1303 // Float/Double min max
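     // These follow Java's Math.min/max semantics: a NaN in either input produces
     // NaN, and -0.0 is treated as strictly smaller than +0.0. Plain vminps/vmaxps
     // return the second source when the inputs are unordered or are both zero, so
     // the sequence first reorders the operands with sign-driven blends (so the
     // correct signed zero wins the tie), applies vmin/vmax, and then uses an
     // unordered self-compare of the reordered first operand to route NaN lanes
     // through to the destination.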
1304 
1305 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1306                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1307                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1308                                    int vlen_enc) {
1309   assert(UseAVX > 0, "required");
1310   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1311          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1312   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1313   assert_different_registers(a, b, tmp, atmp, btmp);
1314 
1315   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1316   bool is_double_word = is_double_word_type(elem_bt);
1317 
1318   if (!is_double_word && is_min) {
1319     vblendvps(atmp, a, b, a, vlen_enc);
1320     vblendvps(btmp, b, a, a, vlen_enc);
1321     vminps(tmp, atmp, btmp, vlen_enc);
1322     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1323     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1324   } else if (!is_double_word && !is_min) {
1325     vblendvps(btmp, b, a, b, vlen_enc);
1326     vblendvps(atmp, a, b, b, vlen_enc);
1327     vmaxps(tmp, atmp, btmp, vlen_enc);
1328     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1329     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1330   } else if (is_double_word && is_min) {
1331     vblendvpd(atmp, a, b, a, vlen_enc);
1332     vblendvpd(btmp, b, a, a, vlen_enc);
1333     vminpd(tmp, atmp, btmp, vlen_enc);
1334     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1335     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1336   } else {
1337     assert(is_double_word && !is_min, "sanity");
1338     vblendvpd(btmp, b, a, b, vlen_enc);
1339     vblendvpd(atmp, a, b, b, vlen_enc);
1340     vmaxpd(tmp, atmp, btmp, vlen_enc);
1341     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1342     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1343   }
1344 }
1345 
1346 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1347                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1348                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1349                                     int vlen_enc) {
1350   assert(UseAVX > 2, "required");
1351   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1352          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1353   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1354   assert_different_registers(dst, a, b, atmp, btmp);
1355 
1356   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1357   bool is_double_word = is_double_word_type(elem_bt);
1358   bool merge = true;
1359 
1360   if (!is_double_word && is_min) {
1361     evpmovd2m(ktmp, a, vlen_enc);
1362     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1363     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1364     vminps(dst, atmp, btmp, vlen_enc);
1365     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1366     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1367   } else if (!is_double_word && !is_min) {
1368     evpmovd2m(ktmp, b, vlen_enc);
1369     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1370     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1371     vmaxps(dst, atmp, btmp, vlen_enc);
1372     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1373     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1374   } else if (is_double_word && is_min) {
1375     evpmovq2m(ktmp, a, vlen_enc);
1376     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1377     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1378     vminpd(dst, atmp, btmp, vlen_enc);
1379     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1380     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1381   } else {
1382     assert(is_double_word && !is_min, "sanity");
1383     evpmovq2m(ktmp, b, vlen_enc);
1384     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1385     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1386     vmaxpd(dst, atmp, btmp, vlen_enc);
1387     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1388     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1389   }
1390 }
1391 
1392 // Float/Double signum
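     // Expects the argument in dst, 0.0 of the matching width in 'zero' and 1.0 in
     // 'one'. Returns the argument unchanged for +0.0/-0.0 and NaN, +1.0 for
     // positive and -1.0 for negative inputs; the negative case is produced by
     // loading 1.0 and flipping its sign bit with the sign-flip mask.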
1393 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1394   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1395 
1396   Label DONE_LABEL;
1397 
1398   if (opcode == Op_SignumF) {
1399     assert(UseSSE > 0, "required");
1400     ucomiss(dst, zero);
1401     jcc(Assembler::equal, DONE_LABEL);    // special case +0.0/-0.0: return the argument unchanged
1402     jcc(Assembler::parity, DONE_LABEL);   // special case NaN: return the argument (NaN) unchanged
1403     movflt(dst, one);
1404     jcc(Assembler::above, DONE_LABEL);
1405     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1406   } else if (opcode == Op_SignumD) {
1407     assert(UseSSE > 1, "required");
1408     ucomisd(dst, zero);
1409     jcc(Assembler::equal, DONE_LABEL);    // special case +0.0/-0.0: return the argument unchanged
1410     jcc(Assembler::parity, DONE_LABEL);   // special case NaN: return the argument (NaN) unchanged
1411     movdbl(dst, one);
1412     jcc(Assembler::above, DONE_LABEL);
1413     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1414   }
1415 
1416   bind(DONE_LABEL);
1417 }
1418 
1419 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1420   if (sign) {
1421     pmovsxbw(dst, src);
1422   } else {
1423     pmovzxbw(dst, src);
1424   }
1425 }
1426 
1427 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1428   if (sign) {
1429     vpmovsxbw(dst, src, vector_len);
1430   } else {
1431     vpmovzxbw(dst, src, vector_len);
1432   }
1433 }
1434 
1435 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1436   if (sign) {
1437     vpmovsxbd(dst, src, vector_len);
1438   } else {
1439     vpmovzxbd(dst, src, vector_len);
1440   }
1441 }
1442 
1443 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1444   if (sign) {
1445     vpmovsxwd(dst, src, vector_len);
1446   } else {
1447     vpmovzxwd(dst, src, vector_len);
1448   }
1449 }
1450 
1451 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1452                                      int shift, int vector_len) {
1453   if (opcode == Op_RotateLeftV) {
1454     if (etype == T_INT) {
1455       evprold(dst, src, shift, vector_len);
1456     } else {
1457       assert(etype == T_LONG, "expected type T_LONG");
1458       evprolq(dst, src, shift, vector_len);
1459     }
1460   } else {
1461     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1462     if (etype == T_INT) {
1463       evprord(dst, src, shift, vector_len);
1464     } else {
1465       assert(etype == T_LONG, "expected type T_LONG");
1466       evprorq(dst, src, shift, vector_len);
1467     }
1468   }
1469 }
1470 
1471 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1472                                      XMMRegister shift, int vector_len) {
1473   if (opcode == Op_RotateLeftV) {
1474     if (etype == T_INT) {
1475       evprolvd(dst, src, shift, vector_len);
1476     } else {
1477       assert(etype == T_LONG, "expected type T_LONG");
1478       evprolvq(dst, src, shift, vector_len);
1479     }
1480   } else {
1481     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1482     if (etype == T_INT) {
1483       evprorvd(dst, src, shift, vector_len);
1484     } else {
1485       assert(etype == T_LONG, "expected type T_LONG");
1486       evprorvq(dst, src, shift, vector_len);
1487     }
1488   }
1489 }
1490 
1491 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1492   if (opcode == Op_RShiftVI) {
1493     psrad(dst, shift);
1494   } else if (opcode == Op_LShiftVI) {
1495     pslld(dst, shift);
1496   } else {
1497     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1498     psrld(dst, shift);
1499   }
1500 }
1501 
1502 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1503   switch (opcode) {
1504     case Op_RShiftVI:  psrad(dst, shift); break;
1505     case Op_LShiftVI:  pslld(dst, shift); break;
1506     case Op_URShiftVI: psrld(dst, shift); break;
1507 
1508     default: assert(false, "%s", NodeClassNames[opcode]);
1509   }
1510 }
1511 
1512 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1513   if (opcode == Op_RShiftVI) {
1514     vpsrad(dst, nds, shift, vector_len);
1515   } else if (opcode == Op_LShiftVI) {
1516     vpslld(dst, nds, shift, vector_len);
1517   } else {
1518     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1519     vpsrld(dst, nds, shift, vector_len);
1520   }
1521 }
1522 
1523 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1524   switch (opcode) {
1525     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1526     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1527     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1528 
1529     default: assert(false, "%s", NodeClassNames[opcode]);
1530   }
1531 }
1532 
1533 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1534   switch (opcode) {
1535     case Op_RShiftVB:  // fall-through
1536     case Op_RShiftVS:  psraw(dst, shift); break;
1537 
1538     case Op_LShiftVB:  // fall-through
1539     case Op_LShiftVS:  psllw(dst, shift);   break;
1540 
1541     case Op_URShiftVS: // fall-through
1542     case Op_URShiftVB: psrlw(dst, shift);  break;
1543 
1544     default: assert(false, "%s", NodeClassNames[opcode]);
1545   }
1546 }
1547 
1548 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1549   switch (opcode) {
1550     case Op_RShiftVB:  // fall-through
1551     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1552 
1553     case Op_LShiftVB:  // fall-through
1554     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1555 
1556     case Op_URShiftVS: // fall-through
1557     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1558 
1559     default: assert(false, "%s", NodeClassNames[opcode]);
1560   }
1561 }
1562 
1563 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1564   switch (opcode) {
1565     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1566     case Op_LShiftVL:  psllq(dst, shift); break;
1567     case Op_URShiftVL: psrlq(dst, shift); break;
1568 
1569     default: assert(false, "%s", NodeClassNames[opcode]);
1570   }
1571 }
1572 
1573 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1574   if (opcode == Op_RShiftVL) {
1575     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1576   } else if (opcode == Op_LShiftVL) {
1577     psllq(dst, shift);
1578   } else {
1579     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1580     psrlq(dst, shift);
1581   }
1582 }
1583 
1584 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1585   switch (opcode) {
1586     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1587     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1588     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1589 
1590     default: assert(false, "%s", NodeClassNames[opcode]);
1591   }
1592 }
1593 
1594 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1595   if (opcode == Op_RShiftVL) {
1596     evpsraq(dst, nds, shift, vector_len);
1597   } else if (opcode == Op_LShiftVL) {
1598     vpsllq(dst, nds, shift, vector_len);
1599   } else {
1600     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1601     vpsrlq(dst, nds, shift, vector_len);
1602   }
1603 }
1604 
1605 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1606   switch (opcode) {
1607     case Op_RShiftVB:  // fall-through
1608     case Op_RShiftVS:  // fall-through
1609     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1610 
1611     case Op_LShiftVB:  // fall-through
1612     case Op_LShiftVS:  // fall-through
1613     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1614 
1615     case Op_URShiftVB: // fall-through
1616     case Op_URShiftVS: // fall-through
1617     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1618 
1619     default: assert(false, "%s", NodeClassNames[opcode]);
1620   }
1621 }
1622 
1623 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1624   switch (opcode) {
1625     case Op_RShiftVB:  // fall-through
1626     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1627 
1628     case Op_LShiftVB:  // fall-through
1629     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1630 
1631     case Op_URShiftVB: // fall-through
1632     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1633 
1634     default: assert(false, "%s", NodeClassNames[opcode]);
1635   }
1636 }
1637 
1638 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1639   assert(UseAVX >= 2, "required");
1640   switch (opcode) {
1641     case Op_RShiftVL: {
1642       if (UseAVX > 2) {
1643         assert(tmp == xnoreg, "not used");
1644         if (!VM_Version::supports_avx512vl()) {
1645           vlen_enc = Assembler::AVX_512bit;
1646         }
1647         evpsravq(dst, src, shift, vlen_enc);
1648       } else {
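             // AVX2 has no variable arithmetic right shift for 64-bit lanes.
             // Assuming vector_long_sign_mask holds 0x8000000000000000 in every
             // lane (m), the identity sra(x, s) == (srl(x, s) ^ srl(m, s)) - srl(m, s)
             // recovers the arithmetic shift from two logical shifts: the xor and
             // subtract sign-extend the shifted-in bits of negative lanes.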
1649         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1650         vpsrlvq(dst, src, shift, vlen_enc);
1651         vpsrlvq(tmp, tmp, shift, vlen_enc);
1652         vpxor(dst, dst, tmp, vlen_enc);
1653         vpsubq(dst, dst, tmp, vlen_enc);
1654       }
1655       break;
1656     }
1657     case Op_LShiftVL: {
1658       assert(tmp == xnoreg, "not used");
1659       vpsllvq(dst, src, shift, vlen_enc);
1660       break;
1661     }
1662     case Op_URShiftVL: {
1663       assert(tmp == xnoreg, "not used");
1664       vpsrlvq(dst, src, shift, vlen_enc);
1665       break;
1666     }
1667     default: assert(false, "%s", NodeClassNames[opcode]);
1668   }
1669 }
1670 
1671 // Variable shift src by shift, using vtmp as a TEMP, giving a word result in dst
1672 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1673   assert(opcode == Op_LShiftVB ||
1674          opcode == Op_RShiftVB ||
1675          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1676   bool sign = (opcode != Op_URShiftVB);
1677   assert(vector_len == 0, "required");
1678   vextendbd(sign, dst, src, 1);
1679   vpmovzxbd(vtmp, shift, 1);
1680   varshiftd(opcode, dst, dst, vtmp, 1);
1681   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1682   vextracti128_high(vtmp, dst);
1683   vpackusdw(dst, dst, vtmp, 0);
1684 }
1685 
1686 // Variable shift src by shift, using vtmp as a TEMP, giving a byte result in dst
1687 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1688   assert(opcode == Op_LShiftVB ||
1689          opcode == Op_RShiftVB ||
1690          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1691   bool sign = (opcode != Op_URShiftVB);
1692   int ext_vector_len = vector_len + 1;
1693   vextendbw(sign, dst, src, ext_vector_len);
1694   vpmovzxbw(vtmp, shift, ext_vector_len);
1695   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1696   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1697   if (vector_len == 0) {
1698     vextracti128_high(vtmp, dst);
1699     vpackuswb(dst, dst, vtmp, vector_len);
1700   } else {
1701     vextracti64x4_high(vtmp, dst);
1702     vpackuswb(dst, dst, vtmp, vector_len);
1703     vpermq(dst, dst, 0xD8, vector_len);
1704   }
1705 }
1706 
1707 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1708   switch(typ) {
1709     case T_BYTE:
1710       pinsrb(dst, val, idx);
1711       break;
1712     case T_SHORT:
1713       pinsrw(dst, val, idx);
1714       break;
1715     case T_INT:
1716       pinsrd(dst, val, idx);
1717       break;
1718     case T_LONG:
1719       pinsrq(dst, val, idx);
1720       break;
1721     default:
1722       assert(false,"Should not reach here.");
1723       break;
1724   }
1725 }
1726 
1727 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1728   switch(typ) {
1729     case T_BYTE:
1730       vpinsrb(dst, src, val, idx);
1731       break;
1732     case T_SHORT:
1733       vpinsrw(dst, src, val, idx);
1734       break;
1735     case T_INT:
1736       vpinsrd(dst, src, val, idx);
1737       break;
1738     case T_LONG:
1739       vpinsrq(dst, src, val, idx);
1740       break;
1741     default:
1742       assert(false,"Should not reach here.");
1743       break;
1744   }
1745 }
1746 
1747 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1748   switch(typ) {
1749     case T_INT:
1750       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1751       break;
1752     case T_FLOAT:
1753       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1754       break;
1755     case T_LONG:
1756       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1757       break;
1758     case T_DOUBLE:
1759       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1760       break;
1761     default:
1762       assert(false,"Should not reach here.");
1763       break;
1764   }
1765 }
1766 
1767 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1768   switch(typ) {
1769     case T_INT:
1770       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1771       break;
1772     case T_FLOAT:
1773       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1774       break;
1775     case T_LONG:
1776       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1777       break;
1778     case T_DOUBLE:
1779       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1780       break;
1781     default:
1782       assert(false,"Should not reach here.");
1783       break;
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1788   switch(typ) {
1789     case T_INT:
1790       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1791       break;
1792     case T_FLOAT:
1793       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1794       break;
1795     case T_LONG:
1796       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1797       break;
1798     case T_DOUBLE:
1799       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1800       break;
1801     default:
1802       assert(false,"Should not reach here.");
1803       break;
1804   }
1805 }
1806 
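     // Convert a vector of boolean bytes (expected to be 0 or 1 per element) into
     // a vector mask with every bit of a lane set for true and clear for false:
     // 0 - src maps 1 to 0xFF and 0 to 0x00 per byte, and the following sign
     // extension widens that to the requested element size.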
1807 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1808   if (vlen_in_bytes <= 16) {
1809     pxor (dst, dst);
1810     psubb(dst, src);
1811     switch (elem_bt) {
1812       case T_BYTE:   /* nothing to do */ break;
1813       case T_SHORT:  pmovsxbw(dst, dst); break;
1814       case T_INT:    pmovsxbd(dst, dst); break;
1815       case T_FLOAT:  pmovsxbd(dst, dst); break;
1816       case T_LONG:   pmovsxbq(dst, dst); break;
1817       case T_DOUBLE: pmovsxbq(dst, dst); break;
1818 
1819       default: assert(false, "%s", type2name(elem_bt));
1820     }
1821   } else {
1822     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1823     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1824 
1825     vpxor (dst, dst, dst, vlen_enc);
1826     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1827 
1828     switch (elem_bt) {
1829       case T_BYTE:   /* nothing to do */            break;
1830       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1831       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1832       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1833       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1834       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1835 
1836       default: assert(false, "%s", type2name(elem_bt));
1837     }
1838   }
1839 }
1840 
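     // Same conversion, but producing an AVX-512 opmask. Without the needed
     // VL/BW/DQ extensions (novlbwdq) the boolean bytes are widened to dwords and
     // compared for equality against a stub constant to set the mask bits;
     // otherwise the bytes are negated to 0x00/0xFF and evpmovb2m copies their
     // sign bits straight into the k-register.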
1841 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1842   if (novlbwdq) {
1843     vpmovsxbd(xtmp, src, vlen_enc);
1844     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1845             Assembler::eq, true, vlen_enc, noreg);
1846   } else {
1847     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1848     vpsubb(xtmp, xtmp, src, vlen_enc);
1849     evpmovb2m(dst, xtmp, vlen_enc);
1850   }
1851 }
1852 
1853 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1854   switch (vlen_in_bytes) {
1855     case 4:  movdl(dst, src);   break;
1856     case 8:  movq(dst, src);    break;
1857     case 16: movdqu(dst, src);  break;
1858     case 32: vmovdqu(dst, src); break;
1859     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1860     default: ShouldNotReachHere();
1861   }
1862 }
1863 
1864 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1865   assert(rscratch != noreg || always_reachable(src), "missing");
1866 
1867   if (reachable(src)) {
1868     load_vector(dst, as_Address(src), vlen_in_bytes);
1869   } else {
1870     lea(rscratch, src);
1871     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1872   }
1873 }
1874 
1875 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1876   int vlen_enc = vector_length_encoding(vlen);
1877   if (VM_Version::supports_avx()) {
1878     if (bt == T_LONG) {
1879       if (VM_Version::supports_avx2()) {
1880         vpbroadcastq(dst, src, vlen_enc);
1881       } else {
1882         vmovddup(dst, src, vlen_enc);
1883       }
1884     } else if (bt == T_DOUBLE) {
1885       if (vlen_enc != Assembler::AVX_128bit) {
1886         vbroadcastsd(dst, src, vlen_enc, noreg);
1887       } else {
1888         vmovddup(dst, src, vlen_enc);
1889       }
1890     } else {
1891       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1892         vpbroadcastd(dst, src, vlen_enc);
1893       } else {
1894         vbroadcastss(dst, src, vlen_enc);
1895       }
1896     }
1897   } else if (VM_Version::supports_sse3()) {
1898     movddup(dst, src);
1899   } else {
1900     movq(dst, src);
1901     if (vlen == 16) {
1902       punpcklqdq(dst, dst);
1903     }
1904   }
1905 }
1906 
1907 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1908   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
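       // For example: T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
       // T_FLOAT -> 128 + 128 = 256, T_DOUBLE -> 192 + 128 = 320.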
1909   int offset = exact_log2(type2aelembytes(bt)) << 6;
1910   if (is_floating_point_type(bt)) {
1911     offset += 128;
1912   }
1913   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1914   load_vector(dst, addr, vlen_in_bytes);
1915 }
1916 
1917 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
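     // The common scheme: repeatedly fold the upper half of the vector into the
     // lower half (extracting the high 256/128 bits where needed) using the
     // 128-/256-bit helper for the operation, until a single lane remains. The
     // integer reductions then combine that lane with the scalar accumulator
     // passed in src1 and leave the result in dst; the float/double reductions
     // accumulate into lane 0 of dst instead.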
1918 
1919 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1920   int vector_len = Assembler::AVX_128bit;
1921 
1922   switch (opcode) {
1923     case Op_AndReductionV:  pand(dst, src); break;
1924     case Op_OrReductionV:   por (dst, src); break;
1925     case Op_XorReductionV:  pxor(dst, src); break;
1926     case Op_MinReductionV:
1927       switch (typ) {
1928         case T_BYTE:        pminsb(dst, src); break;
1929         case T_SHORT:       pminsw(dst, src); break;
1930         case T_INT:         pminsd(dst, src); break;
1931         case T_LONG:        assert(UseAVX > 2, "required");
1932                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1933         default:            assert(false, "wrong type");
1934       }
1935       break;
1936     case Op_MaxReductionV:
1937       switch (typ) {
1938         case T_BYTE:        pmaxsb(dst, src); break;
1939         case T_SHORT:       pmaxsw(dst, src); break;
1940         case T_INT:         pmaxsd(dst, src); break;
1941         case T_LONG:        assert(UseAVX > 2, "required");
1942                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1943         default:            assert(false, "wrong type");
1944       }
1945       break;
1946     case Op_AddReductionVF: addss(dst, src); break;
1947     case Op_AddReductionVD: addsd(dst, src); break;
1948     case Op_AddReductionVI:
1949       switch (typ) {
1950         case T_BYTE:        paddb(dst, src); break;
1951         case T_SHORT:       paddw(dst, src); break;
1952         case T_INT:         paddd(dst, src); break;
1953         default:            assert(false, "wrong type");
1954       }
1955       break;
1956     case Op_AddReductionVL: paddq(dst, src); break;
1957     case Op_MulReductionVF: mulss(dst, src); break;
1958     case Op_MulReductionVD: mulsd(dst, src); break;
1959     case Op_MulReductionVI:
1960       switch (typ) {
1961         case T_SHORT:       pmullw(dst, src); break;
1962         case T_INT:         pmulld(dst, src); break;
1963         default:            assert(false, "wrong type");
1964       }
1965       break;
1966     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1967                             evpmullq(dst, dst, src, vector_len); break;
1968     default:                assert(false, "wrong opcode");
1969   }
1970 }
1971 
1972 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1973   int vector_len = Assembler::AVX_256bit;
1974 
1975   switch (opcode) {
1976     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1977     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1978     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1979     case Op_MinReductionV:
1980       switch (typ) {
1981         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1982         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1983         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1984         case T_LONG:        assert(UseAVX > 2, "required");
1985                             vpminsq(dst, src1, src2, vector_len); break;
1986         default:            assert(false, "wrong type");
1987       }
1988       break;
1989     case Op_MaxReductionV:
1990       switch (typ) {
1991         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1992         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1993         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1994         case T_LONG:        assert(UseAVX > 2, "required");
1995                             vpmaxsq(dst, src1, src2, vector_len); break;
1996         default:            assert(false, "wrong type");
1997       }
1998       break;
1999     case Op_AddReductionVI:
2000       switch (typ) {
2001         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2002         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2003         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2004         default:            assert(false, "wrong type");
2005       }
2006       break;
2007     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2008     case Op_MulReductionVI:
2009       switch (typ) {
2010         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2011         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2012         default:            assert(false, "wrong type");
2013       }
2014       break;
2015     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2016     default:                assert(false, "wrong opcode");
2017   }
2018 }
2019 
2020 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2021                                   XMMRegister dst, XMMRegister src,
2022                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2023   switch (opcode) {
2024     case Op_AddReductionVF:
2025     case Op_MulReductionVF:
2026       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2027       break;
2028 
2029     case Op_AddReductionVD:
2030     case Op_MulReductionVD:
2031       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2032       break;
2033 
2034     default: assert(false, "wrong opcode");
2035   }
2036 }
2037 
2038 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2039                              Register dst, Register src1, XMMRegister src2,
2040                              XMMRegister vtmp1, XMMRegister vtmp2) {
2041   switch (vlen) {
2042     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2043     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2044     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2045     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2046 
2047     default: assert(false, "wrong vector length");
2048   }
2049 }
2050 
2051 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2052                              Register dst, Register src1, XMMRegister src2,
2053                              XMMRegister vtmp1, XMMRegister vtmp2) {
2054   switch (vlen) {
2055     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2056     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2057     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2058     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2059 
2060     default: assert(false, "wrong vector length");
2061   }
2062 }
2063 
2064 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2065                              Register dst, Register src1, XMMRegister src2,
2066                              XMMRegister vtmp1, XMMRegister vtmp2) {
2067   switch (vlen) {
2068     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2069     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2070     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2071     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2072 
2073     default: assert(false, "wrong vector length");
2074   }
2075 }
2076 
2077 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2078                              Register dst, Register src1, XMMRegister src2,
2079                              XMMRegister vtmp1, XMMRegister vtmp2) {
2080   switch (vlen) {
2081     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2082     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2083     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2084     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2085 
2086     default: assert(false, "wrong vector length");
2087   }
2088 }
2089 
2090 #ifdef _LP64
2091 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2092                              Register dst, Register src1, XMMRegister src2,
2093                              XMMRegister vtmp1, XMMRegister vtmp2) {
2094   switch (vlen) {
2095     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2096     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2097     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2098 
2099     default: assert(false, "wrong vector length");
2100   }
2101 }
2102 #endif // _LP64
2103 
2104 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2105   switch (vlen) {
2106     case 2:
2107       assert(vtmp2 == xnoreg, "");
2108       reduce2F(opcode, dst, src, vtmp1);
2109       break;
2110     case 4:
2111       assert(vtmp2 == xnoreg, "");
2112       reduce4F(opcode, dst, src, vtmp1);
2113       break;
2114     case 8:
2115       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2116       break;
2117     case 16:
2118       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2119       break;
2120     default: assert(false, "wrong vector length");
2121   }
2122 }
2123 
2124 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2125   switch (vlen) {
2126     case 2:
2127       assert(vtmp2 == xnoreg, "");
2128       reduce2D(opcode, dst, src, vtmp1);
2129       break;
2130     case 4:
2131       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2132       break;
2133     case 8:
2134       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2135       break;
2136     default: assert(false, "wrong vector length");
2137   }
2138 }
2139 
2140 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2141   if (opcode == Op_AddReductionVI) {
2142     if (vtmp1 != src2) {
2143       movdqu(vtmp1, src2);
2144     }
2145     phaddd(vtmp1, vtmp1);
2146   } else {
2147     pshufd(vtmp1, src2, 0x1);
2148     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2149   }
2150   movdl(vtmp2, src1);
2151   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2152   movdl(dst, vtmp1);
2153 }
2154 
2155 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2156   if (opcode == Op_AddReductionVI) {
2157     if (vtmp1 != src2) {
2158       movdqu(vtmp1, src2);
2159     }
2160     phaddd(vtmp1, src2);
2161     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2162   } else {
2163     pshufd(vtmp2, src2, 0xE);
2164     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2165     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2166   }
2167 }
2168 
2169 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2170   if (opcode == Op_AddReductionVI) {
2171     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2172     vextracti128_high(vtmp2, vtmp1);
2173     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2174     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2175   } else {
2176     vextracti128_high(vtmp1, src2);
2177     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2178     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2179   }
2180 }
2181 
2182 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   vextracti64x4_high(vtmp2, src2);
2184   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2185   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2186 }
2187 
2188 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2189   pshufd(vtmp2, src2, 0x1);
2190   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2191   movdqu(vtmp1, vtmp2);
2192   psrldq(vtmp1, 2);
2193   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2194   movdqu(vtmp2, vtmp1);
2195   psrldq(vtmp2, 1);
2196   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2197   movdl(vtmp2, src1);
2198   pmovsxbd(vtmp1, vtmp1);
2199   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2200   pextrb(dst, vtmp1, 0x0);
2201   movsbl(dst, dst);
2202 }
2203 
2204 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2205   pshufd(vtmp1, src2, 0xE);
2206   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2207   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2208 }
2209 
2210 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2211   vextracti128_high(vtmp2, src2);
2212   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2213   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2214 }
2215 
2216 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2217   vextracti64x4_high(vtmp1, src2);
2218   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2219   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2220 }
2221 
2222 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2223   pmovsxbw(vtmp2, src2);
2224   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2225 }
2226 
2227 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2228   if (UseAVX > 1) {
2229     int vector_len = Assembler::AVX_256bit;
2230     vpmovsxbw(vtmp1, src2, vector_len);
2231     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2232   } else {
2233     pmovsxbw(vtmp2, src2);
2234     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2235     pshufd(vtmp2, src2, 0xE);    // bring the upper 8 bytes of src2 into the low half
2236     pmovsxbw(vtmp2, vtmp2);      // and widen them, so the second pass reduces bytes 8..15
2237     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2238   }
2239 }
2240 
2241 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2242   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2243     int vector_len = Assembler::AVX_512bit;
2244     vpmovsxbw(vtmp1, src2, vector_len);
2245     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2246   } else {
2247     assert(UseAVX >= 2,"Should not reach here.");
2248     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2249     vextracti128_high(vtmp2, src2);
2250     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2251   }
2252 }
2253 
2254 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2255   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2256   vextracti64x4_high(vtmp2, src2);
2257   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2258 }
2259 
2260 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2261   if (opcode == Op_AddReductionVI) {
2262     if (vtmp1 != src2) {
2263       movdqu(vtmp1, src2);
2264     }
2265     phaddw(vtmp1, vtmp1);
2266     phaddw(vtmp1, vtmp1);
2267   } else {
2268     pshufd(vtmp2, src2, 0x1);
2269     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2270     movdqu(vtmp1, vtmp2);
2271     psrldq(vtmp1, 2);
2272     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2273   }
2274   movdl(vtmp2, src1);
2275   pmovsxwd(vtmp1, vtmp1);
2276   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2277   pextrw(dst, vtmp1, 0x0);
2278   movswl(dst, dst);
2279 }
2280 
2281 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2282   if (opcode == Op_AddReductionVI) {
2283     if (vtmp1 != src2) {
2284       movdqu(vtmp1, src2);
2285     }
2286     phaddw(vtmp1, src2);
2287   } else {
2288     pshufd(vtmp1, src2, 0xE);
2289     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2290   }
2291   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2292 }
2293 
2294 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2295   if (opcode == Op_AddReductionVI) {
2296     int vector_len = Assembler::AVX_256bit;
2297     vphaddw(vtmp2, src2, src2, vector_len);
2298     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2299   } else {
2300     vextracti128_high(vtmp2, src2);
2301     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2302   }
2303   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2304 }
2305 
2306 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2307   int vector_len = Assembler::AVX_256bit;
2308   vextracti64x4_high(vtmp1, src2);
2309   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2310   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2311 }
2312 
2313 #ifdef _LP64
2314 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2315   pshufd(vtmp2, src2, 0xE);
2316   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2317   movdq(vtmp1, src1);
2318   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2319   movdq(dst, vtmp1);
2320 }
2321 
2322 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2323   vextracti128_high(vtmp1, src2);
2324   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2325   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2326 }
2327 
2328 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2329   vextracti64x4_high(vtmp2, src2);
2330   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2331   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2332 }
2333 
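     // Build a k-mask with the low 'len' bits set: BZHI zeroes every bit of the
     // all-ones value at position >= len, and the result is moved into the opmask.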
2334 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2335   mov64(temp, -1L);
2336   bzhiq(temp, temp, len);
2337   kmovql(dst, temp);
2338 }
2339 #endif // _LP64
2340 
2341 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2342   reduce_operation_128(T_FLOAT, opcode, dst, src);
2343   pshufd(vtmp, src, 0x1);
2344   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2345 }
2346 
2347 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2348   reduce2F(opcode, dst, src, vtmp);
2349   pshufd(vtmp, src, 0x2);
2350   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2351   pshufd(vtmp, src, 0x3);
2352   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2353 }
2354 
2355 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2356   reduce4F(opcode, dst, src, vtmp2);
2357   vextractf128_high(vtmp2, src);
2358   reduce4F(opcode, dst, vtmp2, vtmp1);
2359 }
2360 
2361 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2362   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2363   vextracti64x4_high(vtmp1, src);
2364   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2365 }
2366 
2367 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2368   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2369   pshufd(vtmp, src, 0xE);
2370   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2371 }
2372 
2373 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2374   reduce2D(opcode, dst, src, vtmp2);
2375   vextractf128_high(vtmp2, src);
2376   reduce2D(opcode, dst, vtmp2, vtmp1);
2377 }
2378 
2379 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2380   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2381   vextracti64x4_high(vtmp1, src);
2382   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2383 }
2384 
2385 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2386   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2387 }
2388 
2389 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2390   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2391 }
2392 
2393 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2394                                  int vec_enc) {
2395   switch(elem_bt) {
2396     case T_INT:
2397     case T_FLOAT:
2398       vmaskmovps(dst, src, mask, vec_enc);
2399       break;
2400     case T_LONG:
2401     case T_DOUBLE:
2402       vmaskmovpd(dst, src, mask, vec_enc);
2403       break;
2404     default:
2405       fatal("Unsupported type %s", type2name(elem_bt));
2406       break;
2407   }
2408 }
2409 
2410 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2411                                  int vec_enc) {
2412   switch(elem_bt) {
2413     case T_INT:
2414     case T_FLOAT:
2415       vmaskmovps(dst, src, mask, vec_enc);
2416       break;
2417     case T_LONG:
2418     case T_DOUBLE:
2419       vmaskmovpd(dst, src, mask, vec_enc);
2420       break;
2421     default:
2422       fatal("Unsupported type %s", type2name(elem_bt));
2423       break;
2424   }
2425 }
2426 
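     // Min/max reduction over float lanes. Each iteration halves the number of
     // live lanes: for i == 3 / i == 2 the upper 256/128 bits are extracted, for
     // i == 1 / i == 0 the remaining lanes are paired up via vpermilps using
     // permconst, and vminmax_fp combines each pair with full NaN/-0.0 handling.
     // If is_dst_valid, the partial result is finally folded into the accumulator
     // already held in dst.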
2427 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2428                                           XMMRegister dst, XMMRegister src,
2429                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2430                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2431   const int permconst[] = {1, 14};
2432   XMMRegister wsrc = src;
2433   XMMRegister wdst = xmm_0;
2434   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2435 
2436   int vlen_enc = Assembler::AVX_128bit;
2437   if (vlen == 16) {
2438     vlen_enc = Assembler::AVX_256bit;
2439   }
2440 
2441   for (int i = log2(vlen) - 1; i >=0; i--) {
2442     if (i == 0 && !is_dst_valid) {
2443       wdst = dst;
2444     }
2445     if (i == 3) {
2446       vextracti64x4_high(wtmp, wsrc);
2447     } else if (i == 2) {
2448       vextracti128_high(wtmp, wsrc);
2449     } else { // i = [0,1]
2450       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2451     }
2452     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2453     wsrc = wdst;
2454     vlen_enc = Assembler::AVX_128bit;
2455   }
2456   if (is_dst_valid) {
2457     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2458   }
2459 }
2460 
2461 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2462                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2463                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2464   XMMRegister wsrc = src;
2465   XMMRegister wdst = xmm_0;
2466   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2467   int vlen_enc = Assembler::AVX_128bit;
2468   if (vlen == 8) {
2469     vlen_enc = Assembler::AVX_256bit;
2470   }
2471   for (int i = log2(vlen) - 1; i >=0; i--) {
2472     if (i == 0 && !is_dst_valid) {
2473       wdst = dst;
2474     }
2475     if (i == 1) {
2476       vextracti128_high(wtmp, wsrc);
2477     } else if (i == 2) {
2478       vextracti64x4_high(wtmp, wsrc);
2479     } else {
2480       assert(i == 0, "%d", i);
2481       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2482     }
2483     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2484     wsrc = wdst;
2485     vlen_enc = Assembler::AVX_128bit;
2486   }
2487   if (is_dst_valid) {
2488     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2489   }
2490 }
2491 
2492 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2493   switch (bt) {
2494     case T_BYTE:  pextrb(dst, src, idx); break;
2495     case T_SHORT: pextrw(dst, src, idx); break;
2496     case T_INT:   pextrd(dst, src, idx); break;
2497     case T_LONG:  pextrq(dst, src, idx); break;
2498 
2499     default:
2500       assert(false,"Should not reach here.");
2501       break;
2502   }
2503 }
2504 
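     // Return a register holding the 128-bit lane that contains 'elemindex':
     // lane 0 is served directly from src, lane 1 needs AVX (vextractf128), and
     // lanes 2/3 need AVX-512 (vextractf32x4). The caller extracts the element
     // from within the returned lane.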
2505 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2506   int esize =  type2aelembytes(typ);
2507   int elem_per_lane = 16/esize;
2508   int lane = elemindex / elem_per_lane;
2509   int eindex = elemindex % elem_per_lane;
2510 
2511   if (lane >= 2) {
2512     assert(UseAVX > 2, "required");
2513     vextractf32x4(dst, src, lane & 3);
2514     return dst;
2515   } else if (lane > 0) {
2516     assert(UseAVX > 0, "required");
2517     vextractf128(dst, src, lane);
2518     return dst;
2519   } else {
2520     return src;
2521   }
2522 }
2523 
2524 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2525   if (typ == T_BYTE) {
2526     movsbl(dst, dst);
2527   } else if (typ == T_SHORT) {
2528     movswl(dst, dst);
2529   }
2530 }
2531 
2532 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2533   int esize =  type2aelembytes(typ);
2534   int elem_per_lane = 16/esize;
2535   int eindex = elemindex % elem_per_lane;
2536   assert(is_integral_type(typ),"required");
2537 
2538   if (eindex == 0) {
2539     if (typ == T_LONG) {
2540       movq(dst, src);
2541     } else {
2542       movdl(dst, src);
2543       movsxl(typ, dst);
2544     }
2545   } else {
2546     extract(typ, dst, src, eindex);
2547     movsxl(typ, dst);
2548   }
2549 }
2550 
2551 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2552   int esize =  type2aelembytes(typ);
2553   int elem_per_lane = 16/esize;
2554   int eindex = elemindex % elem_per_lane;
2555   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2556 
2557   if (eindex == 0) {
2558     movq(dst, src);
2559   } else {
2560     if (typ == T_FLOAT) {
2561       if (UseAVX == 0) {
2562         movdqu(dst, src);
2563         shufps(dst, dst, eindex);
2564       } else {
2565         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2566       }
2567     } else {
2568       if (UseAVX == 0) {
2569         movdqu(dst, src);
2570         psrldq(dst, eindex*esize);
2571       } else {
2572         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2573       }
2574       movq(dst, dst);
2575     }
2576   }
2577   // Zero upper bits
2578   if (typ == T_FLOAT) {
2579     if (UseAVX == 0) {
2580       assert(vtmp != xnoreg, "required.");
2581       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2582       pand(dst, vtmp);
2583     } else {
2584       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2585     }
2586   }
2587 }
2588 
2589 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2590   switch(typ) {
2591     case T_BYTE:
2592     case T_BOOLEAN:
2593       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2594       break;
2595     case T_SHORT:
2596     case T_CHAR:
2597       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2598       break;
2599     case T_INT:
2600     case T_FLOAT:
2601       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2602       break;
2603     case T_LONG:
2604     case T_DOUBLE:
2605       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2606       break;
2607     default:
2608       assert(false,"Should not reach here.");
2609       break;
2610   }
2611 }
2612 
2613 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2614   assert(rscratch != noreg || always_reachable(src2), "missing");
2615 
2616   switch(typ) {
2617     case T_BOOLEAN:
2618     case T_BYTE:
2619       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2620       break;
2621     case T_CHAR:
2622     case T_SHORT:
2623       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2624       break;
2625     case T_INT:
2626     case T_FLOAT:
2627       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2628       break;
2629     case T_LONG:
2630     case T_DOUBLE:
2631       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2632       break;
2633     default:
2634       assert(false,"Should not reach here.");
2635       break;
2636   }
2637 }
2638 
2639 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2640   switch(typ) {
2641     case T_BYTE:
2642       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2643       break;
2644     case T_SHORT:
2645       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2646       break;
2647     case T_INT:
2648     case T_FLOAT:
2649       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2650       break;
2651     case T_LONG:
2652     case T_DOUBLE:
2653       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2654       break;
2655     default:
2656       assert(false,"Should not reach here.");
2657       break;
2658   }
2659 }
2660 
2661 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2662   assert(vlen_in_bytes <= 32, "");
2663   int esize = type2aelembytes(bt);
2664   if (vlen_in_bytes == 32) {
2665     assert(vtmp == xnoreg, "required.");
2666     if (esize >= 4) {
2667       vtestps(src1, src2, AVX_256bit);
2668     } else {
2669       vptest(src1, src2, AVX_256bit);
2670     }
2671     return;
2672   }
2673   if (vlen_in_bytes < 16) {
2674     // Duplicate the lower part to fill the whole register;
2675     // there is no need to do so for src2.
2676     assert(vtmp != xnoreg, "required");
2677     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2678     pshufd(vtmp, src1, shuffle_imm);
2679   } else {
2680     assert(vtmp == xnoreg, "required");
2681     vtmp = src1;
2682   }
2683   if (esize >= 4 && VM_Version::supports_avx()) {
2684     vtestps(vtmp, src2, AVX_128bit);
2685   } else {
2686     ptest(vtmp, src2);
2687   }
2688 }
2689 
2690 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2691   assert(UseAVX >= 2, "required");
2692 #ifdef ASSERT
2693   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2694   bool is_bw_supported = VM_Version::supports_avx512bw();
2695   if (is_bw && !is_bw_supported) {
2696     assert(vlen_enc != Assembler::AVX_512bit, "required");
2697     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2698            "XMM register should be 0-15");
2699   }
2700 #endif // ASSERT
2701   switch (elem_bt) {
2702     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2703     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2704     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2705     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2706     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2707     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2708     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2709   }
2710 }
2711 
2712 #ifdef _LP64
2713 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
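       // Conceptually dst[i] = src for every lane i (a sketch of the intent,
       // not an exact ISA mapping): use the EVEX GPR-source broadcasts when
       // the AVX-512 features allow it, otherwise move through an XMM
       // register and broadcast from there.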
2714   assert(UseAVX >= 2, "required");
2715   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2716   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2717   if ((UseAVX > 2) &&
2718       (!is_bw || VM_Version::supports_avx512bw()) &&
2719       (!is_vl || VM_Version::supports_avx512vl())) {
2720     switch (elem_bt) {
2721       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2722       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2723       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2724       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2725       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2726     }
2727   } else {
2728     assert(vlen_enc != Assembler::AVX_512bit, "required");
2729     assert((dst->encoding() < 16),"XMM register should be 0-15");
2730     switch (elem_bt) {
2731       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2732       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2733       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2734       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2735       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2736       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2737       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2738     }
2739   }
2740 }
2741 #endif
2742 
2743 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
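       // Per-lane intent (sketch only): dst[i] = (to_elem_bt) src_byte[i],
       // i.e. sign-extend the byte lanes and, for T_FLOAT/T_DOUBLE, convert
       // the resulting ints to floating point.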
2744   switch (to_elem_bt) {
2745     case T_SHORT:
2746       vpmovsxbw(dst, src, vlen_enc);
2747       break;
2748     case T_INT:
2749       vpmovsxbd(dst, src, vlen_enc);
2750       break;
2751     case T_FLOAT:
2752       vpmovsxbd(dst, src, vlen_enc);
2753       vcvtdq2ps(dst, dst, vlen_enc);
2754       break;
2755     case T_LONG:
2756       vpmovsxbq(dst, src, vlen_enc);
2757       break;
2758     case T_DOUBLE: {
2759       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2760       vpmovsxbd(dst, src, mid_vlen_enc);
2761       vcvtdq2pd(dst, dst, vlen_enc);
2762       break;
2763     }
2764     default:
2765       fatal("Unsupported type %s", type2name(to_elem_bt));
2766       break;
2767   }
2768 }
2769 
2770 //-------------------------------------------------------------------------------------------
2771 
2772 // IndexOf for constant substrings with size >= 8 chars
2773 // which don't need to be loaded through the stack.
2774 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2775                                          Register cnt1, Register cnt2,
2776                                          int int_cnt2,  Register result,
2777                                          XMMRegister vec, Register tmp,
2778                                          int ae) {
2779   ShortBranchVerifier sbv(this);
2780   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2781   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2782 
2783   // This method uses the pcmpestri instruction with bound registers
2784   //   inputs:
2785   //     xmm - substring
2786   //     rax - substring length (elements count)
2787   //     mem - scanned string
2788   //     rdx - string length (elements count)
2789   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2790   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2791   //   outputs:
2792   //     rcx - matched index in string
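       //
       // Conceptual scalar equivalent of this intrinsic (a sketch only; the
       // code below vectorizes it with pcmpestri and handles the LL/UU/UL
       // encodings and the bounds checks):
       //   for (int i = 0; i + int_cnt2 <= cnt1; i++) {
       //     if (str1[i .. i+int_cnt2-1] == str2[0 .. int_cnt2-1]) return i;
       //   }
       //   return -1;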
2793   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2794   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2795   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2796   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2797   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2798 
2799   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2800         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2801         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2802 
2803   // Note, inline_string_indexOf() generates checks:
2804   // if (substr.count > string.count) return -1;
2805   // if (substr.count == 0) return 0;
2806   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2807 
2808   // Load substring.
2809   if (ae == StrIntrinsicNode::UL) {
2810     pmovzxbw(vec, Address(str2, 0));
2811   } else {
2812     movdqu(vec, Address(str2, 0));
2813   }
2814   movl(cnt2, int_cnt2);
2815   movptr(result, str1); // string addr
2816 
2817   if (int_cnt2 > stride) {
2818     jmpb(SCAN_TO_SUBSTR);
2819 
2820     // Reload substr for rescan; this code
2821     // is executed only for large substrings (> 8 chars).
2822     bind(RELOAD_SUBSTR);
2823     if (ae == StrIntrinsicNode::UL) {
2824       pmovzxbw(vec, Address(str2, 0));
2825     } else {
2826       movdqu(vec, Address(str2, 0));
2827     }
2828     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2829 
2830     bind(RELOAD_STR);
2831     // We came here after the beginning of the substring was
2832     // matched but the rest of it was not, so we need to search
2833     // again. Start from the next element after the previous match.
2834 
2835     // cnt2 is the number of remaining substring elements and
2836     // cnt1 is the number of remaining string elements when the compare failed.
2837     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2838     subl(cnt1, cnt2);
2839     addl(cnt1, int_cnt2);
2840     movl(cnt2, int_cnt2); // Now restore cnt2
2841 
2842     decrementl(cnt1);     // Shift to next element
2843     cmpl(cnt1, cnt2);
2844     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2845 
2846     addptr(result, (1<<scale1));
2847 
2848   } // (int_cnt2 > 8)
2849 
2850   // Scan string for start of substr in 16-byte vectors
2851   bind(SCAN_TO_SUBSTR);
2852   pcmpestri(vec, Address(result, 0), mode);
2853   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2854   subl(cnt1, stride);
2855   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2856   cmpl(cnt1, cnt2);
2857   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2858   addptr(result, 16);
2859   jmpb(SCAN_TO_SUBSTR);
2860 
2861   // Found a potential substr
2862   bind(FOUND_CANDIDATE);
2863   // Matched whole vector if first element matched (tmp(rcx) == 0).
2864   if (int_cnt2 == stride) {
2865     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2866   } else { // int_cnt2 > 8
2867     jccb(Assembler::overflow, FOUND_SUBSTR);
2868   }
2869   // After pcmpestri tmp(rcx) contains matched element index
2870   // Compute start addr of substr
2871   lea(result, Address(result, tmp, scale1));
2872 
2873   // Make sure string is still long enough
2874   subl(cnt1, tmp);
2875   cmpl(cnt1, cnt2);
2876   if (int_cnt2 == stride) {
2877     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2878   } else { // int_cnt2 > 8
2879     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2880   }
2881   // Fewer elements left than the substring.
2882 
2883   bind(RET_NOT_FOUND);
2884   movl(result, -1);
2885   jmp(EXIT);
2886 
2887   if (int_cnt2 > stride) {
2888     // This code is optimized for the case when the whole substring
2889     // is matched if its head is matched.
2890     bind(MATCH_SUBSTR_HEAD);
2891     pcmpestri(vec, Address(result, 0), mode);
2892     // Reload only the string if it does not match
2893     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2894 
2895     Label CONT_SCAN_SUBSTR;
2896     // Compare the rest of substring (> 8 chars).
2897     bind(FOUND_SUBSTR);
2898     // First 8 chars are already matched.
2899     negptr(cnt2);
2900     addptr(cnt2, stride);
2901 
2902     bind(SCAN_SUBSTR);
2903     subl(cnt1, stride);
2904     cmpl(cnt2, -stride); // Do not read beyond substring
2905     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2906     // Back-up strings to avoid reading beyond substring:
2907     // cnt1 = cnt1 - cnt2 + 8
2908     addl(cnt1, cnt2); // cnt2 is negative
2909     addl(cnt1, stride);
2910     movl(cnt2, stride); negptr(cnt2);
2911     bind(CONT_SCAN_SUBSTR);
2912     if (int_cnt2 < (int)G) {
2913       int tail_off1 = int_cnt2<<scale1;
2914       int tail_off2 = int_cnt2<<scale2;
2915       if (ae == StrIntrinsicNode::UL) {
2916         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2917       } else {
2918         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2919       }
2920       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2921     } else {
2922       // calculate index in register to avoid integer overflow (int_cnt2*2)
2923       movl(tmp, int_cnt2);
2924       addptr(tmp, cnt2);
2925       if (ae == StrIntrinsicNode::UL) {
2926         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2927       } else {
2928         movdqu(vec, Address(str2, tmp, scale2, 0));
2929       }
2930       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2931     }
2932     // Need to reload string pointers if we did not match the whole vector
2933     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2934     addptr(cnt2, stride);
2935     jcc(Assembler::negative, SCAN_SUBSTR);
2936     // Fall through if found full substring
2937 
2938   } // (int_cnt2 > 8)
2939 
2940   bind(RET_FOUND);
2941   // Found result if we matched full small substring.
2942   // Compute substr offset
2943   subptr(result, str1);
2944   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2945     shrl(result, 1); // index
2946   }
2947   bind(EXIT);
2948 
2949 } // string_indexofC8
2950 
2951 // Small strings are loaded through the stack if they cross a page boundary.
2952 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2953                                        Register cnt1, Register cnt2,
2954                                        int int_cnt2,  Register result,
2955                                        XMMRegister vec, Register tmp,
2956                                        int ae) {
2957   ShortBranchVerifier sbv(this);
2958   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2959   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2960 
2961   //
2962   // int_cnt2 is the length of a small (< 8 chars) constant substring
2963   // or (-1) for a non-constant substring, in which case its length
2964   // is in the cnt2 register.
2965   //
2966   // Note, inline_string_indexOf() generates checks:
2967   // if (substr.count > string.count) return -1;
2968   // if (substr.count == 0) return 0;
2969   //
2970   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2971   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2972   // This method uses the pcmpestri instruction with bound registers
2973   //   inputs:
2974   //     xmm - substring
2975   //     rax - substring length (elements count)
2976   //     mem - scanned string
2977   //     rdx - string length (elements count)
2978   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2979   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2980   //   outputs:
2981   //     rcx - matched index in string
2982   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2983   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2984   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2985   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2986 
2987   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2988         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2989         FOUND_CANDIDATE;
2990 
2991   { //========================================================
2992     // We don't know where these strings are located
2993     // and we can't read beyond them. Load them through the stack.
2994     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2995 
2996     movptr(tmp, rsp); // save old SP
2997 
2998     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2999       if (int_cnt2 == (1>>scale2)) { // One byte
3000         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3001         load_unsigned_byte(result, Address(str2, 0));
3002         movdl(vec, result); // move 32 bits
3003       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3004         // Not enough header space in 32-bit VM: 12+3 = 15.
3005         movl(result, Address(str2, -1));
3006         shrl(result, 8);
3007         movdl(vec, result); // move 32 bits
3008       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3009         load_unsigned_short(result, Address(str2, 0));
3010         movdl(vec, result); // move 32 bits
3011       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3012         movdl(vec, Address(str2, 0)); // move 32 bits
3013       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3014         movq(vec, Address(str2, 0));  // move 64 bits
3015       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3016         // Array header size is 12 bytes in 32-bit VM
3017         // + 6 bytes for 3 chars == 18 bytes,
3018         // enough space to load vec and shift.
3019         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3020         if (ae == StrIntrinsicNode::UL) {
3021           int tail_off = int_cnt2-8;
3022           pmovzxbw(vec, Address(str2, tail_off));
3023           psrldq(vec, -2*tail_off);
3024         }
3025         else {
3026           int tail_off = int_cnt2*(1<<scale2);
3027           movdqu(vec, Address(str2, tail_off-16));
3028           psrldq(vec, 16-tail_off);
3029         }
3030       }
3031     } else { // not constant substring
3032       cmpl(cnt2, stride);
3033       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3034 
3035       // We can read beyond the string if str+16 does not cross a page boundary
3036       // since heaps are aligned and mapped by pages.
3037       assert(os::vm_page_size() < (int)G, "default page should be small");
3038       movl(result, str2); // We need only low 32 bits
3039       andl(result, ((int)os::vm_page_size()-1));
3040       cmpl(result, ((int)os::vm_page_size()-16));
3041       jccb(Assembler::belowEqual, CHECK_STR);
3042 
3043       // Move small strings to the stack to allow loading 16 bytes into vec.
3044       subptr(rsp, 16);
3045       int stk_offset = wordSize-(1<<scale2);
3046       push(cnt2);
3047 
3048       bind(COPY_SUBSTR);
3049       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3050         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3051         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3052       } else if (ae == StrIntrinsicNode::UU) {
3053         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3054         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3055       }
3056       decrement(cnt2);
3057       jccb(Assembler::notZero, COPY_SUBSTR);
3058 
3059       pop(cnt2);
3060       movptr(str2, rsp);  // New substring address
3061     } // non constant
3062 
3063     bind(CHECK_STR);
3064     cmpl(cnt1, stride);
3065     jccb(Assembler::aboveEqual, BIG_STRINGS);
3066 
3067     // Check cross page boundary.
3068     movl(result, str1); // We need only low 32 bits
3069     andl(result, ((int)os::vm_page_size()-1));
3070     cmpl(result, ((int)os::vm_page_size()-16));
3071     jccb(Assembler::belowEqual, BIG_STRINGS);
3072 
3073     subptr(rsp, 16);
3074     int stk_offset = -(1<<scale1);
3075     if (int_cnt2 < 0) { // not constant
3076       push(cnt2);
3077       stk_offset += wordSize;
3078     }
3079     movl(cnt2, cnt1);
3080 
3081     bind(COPY_STR);
3082     if (ae == StrIntrinsicNode::LL) {
3083       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3084       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3085     } else {
3086       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3087       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3088     }
3089     decrement(cnt2);
3090     jccb(Assembler::notZero, COPY_STR);
3091 
3092     if (int_cnt2 < 0) { // not constant
3093       pop(cnt2);
3094     }
3095     movptr(str1, rsp);  // New string address
3096 
3097     bind(BIG_STRINGS);
3098     // Load substring.
3099     if (int_cnt2 < 0) { // -1
3100       if (ae == StrIntrinsicNode::UL) {
3101         pmovzxbw(vec, Address(str2, 0));
3102       } else {
3103         movdqu(vec, Address(str2, 0));
3104       }
3105       push(cnt2);       // substr count
3106       push(str2);       // substr addr
3107       push(str1);       // string addr
3108     } else {
3109       // Small (< 8 chars) constant substrings are loaded already.
3110       movl(cnt2, int_cnt2);
3111     }
3112     push(tmp);  // original SP
3113 
3114   } // Finished loading
3115 
3116   //========================================================
3117   // Start search
3118   //
3119 
3120   movptr(result, str1); // string addr
3121 
3122   if (int_cnt2  < 0) {  // Only for non constant substring
3123     jmpb(SCAN_TO_SUBSTR);
3124 
3125     // SP saved at sp+0
3126     // String saved at sp+1*wordSize
3127     // Substr saved at sp+2*wordSize
3128     // Substr count saved at sp+3*wordSize
3129 
3130     // Reload substr for rescan; this code
3131     // is executed only for large substrings (> 8 chars).
3132     bind(RELOAD_SUBSTR);
3133     movptr(str2, Address(rsp, 2*wordSize));
3134     movl(cnt2, Address(rsp, 3*wordSize));
3135     if (ae == StrIntrinsicNode::UL) {
3136       pmovzxbw(vec, Address(str2, 0));
3137     } else {
3138       movdqu(vec, Address(str2, 0));
3139     }
3140     // We came here after the beginning of the substring was
3141     // matched but the rest of it was not, so we need to search
3142     // again. Start from the next element after the previous match.
3143     subptr(str1, result); // Restore counter
3144     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3145       shrl(str1, 1);
3146     }
3147     addl(cnt1, str1);
3148     decrementl(cnt1);   // Shift to next element
3149     cmpl(cnt1, cnt2);
3150     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3151 
3152     addptr(result, (1<<scale1));
3153   } // non constant
3154 
3155   // Scan string for start of substr in 16-byte vectors
3156   bind(SCAN_TO_SUBSTR);
3157   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3158   pcmpestri(vec, Address(result, 0), mode);
3159   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3160   subl(cnt1, stride);
3161   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3162   cmpl(cnt1, cnt2);
3163   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
3164   addptr(result, 16);
3165 
3166   bind(ADJUST_STR);
3167   cmpl(cnt1, stride); // Do not read beyond string
3168   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3169   // Back up the string pointer to avoid reading beyond the string.
3170   lea(result, Address(result, cnt1, scale1, -16));
3171   movl(cnt1, stride);
3172   jmpb(SCAN_TO_SUBSTR);
3173 
3174   // Found a potential substr
3175   bind(FOUND_CANDIDATE);
3176   // After pcmpestri tmp(rcx) contains matched element index
3177 
3178   // Make sure string is still long enough
3179   subl(cnt1, tmp);
3180   cmpl(cnt1, cnt2);
3181   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3182   // Fewer elements left than the substring.
3183 
3184   bind(RET_NOT_FOUND);
3185   movl(result, -1);
3186   jmp(CLEANUP);
3187 
3188   bind(FOUND_SUBSTR);
3189   // Compute start addr of substr
3190   lea(result, Address(result, tmp, scale1));
3191   if (int_cnt2 > 0) { // Constant substring
3192     // Repeat search for small substring (< 8 chars)
3193     // from new point without reloading substring.
3194     // Have to check that we don't read beyond string.
3195     cmpl(tmp, stride-int_cnt2);
3196     jccb(Assembler::greater, ADJUST_STR);
3197     // Fall through if matched whole substring.
3198   } else { // non constant
3199     assert(int_cnt2 == -1, "should be != 0");
3200 
3201     addl(tmp, cnt2);
3202     // Found result if we matched whole substring.
3203     cmpl(tmp, stride);
3204     jcc(Assembler::lessEqual, RET_FOUND);
3205 
3206     // Repeat search for small substring (<= 8 chars)
3207     // from new point 'str1' without reloading substring.
3208     cmpl(cnt2, stride);
3209     // Have to check that we don't read beyond string.
3210     jccb(Assembler::lessEqual, ADJUST_STR);
3211 
3212     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3213     // Compare the rest of substring (> 8 chars).
3214     movptr(str1, result);
3215 
3216     cmpl(tmp, cnt2);
3217     // First 8 chars are already matched.
3218     jccb(Assembler::equal, CHECK_NEXT);
3219 
3220     bind(SCAN_SUBSTR);
3221     pcmpestri(vec, Address(str1, 0), mode);
3222     // Need to reload string pointers if we did not match the whole vector
3223     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3224 
3225     bind(CHECK_NEXT);
3226     subl(cnt2, stride);
3227     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3228     addptr(str1, 16);
3229     if (ae == StrIntrinsicNode::UL) {
3230       addptr(str2, 8);
3231     } else {
3232       addptr(str2, 16);
3233     }
3234     subl(cnt1, stride);
3235     cmpl(cnt2, stride); // Do not read beyond substring
3236     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3237     // Back-up strings to avoid reading beyond substring.
3238 
3239     if (ae == StrIntrinsicNode::UL) {
3240       lea(str2, Address(str2, cnt2, scale2, -8));
3241       lea(str1, Address(str1, cnt2, scale1, -16));
3242     } else {
3243       lea(str2, Address(str2, cnt2, scale2, -16));
3244       lea(str1, Address(str1, cnt2, scale1, -16));
3245     }
3246     subl(cnt1, cnt2);
3247     movl(cnt2, stride);
3248     addl(cnt1, stride);
3249     bind(CONT_SCAN_SUBSTR);
3250     if (ae == StrIntrinsicNode::UL) {
3251       pmovzxbw(vec, Address(str2, 0));
3252     } else {
3253       movdqu(vec, Address(str2, 0));
3254     }
3255     jmp(SCAN_SUBSTR);
3256 
3257     bind(RET_FOUND_LONG);
3258     movptr(str1, Address(rsp, wordSize));
3259   } // non constant
3260 
3261   bind(RET_FOUND);
3262   // Compute substr offset
3263   subptr(result, str1);
3264   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3265     shrl(result, 1); // index
3266   }
3267   bind(CLEANUP);
3268   pop(rsp); // restore SP
3269 
3270 } // string_indexof
3271 
3272 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3273                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3274   ShortBranchVerifier sbv(this);
3275   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3276 
3277   int stride = 8;
3278 
3279   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3280         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3281         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3282         FOUND_SEQ_CHAR, DONE_LABEL;
3283 
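       // Conceptual scalar equivalent (a sketch only; the code below
       // vectorizes it with 16-bit SSE4.2/AVX2 compares over the UTF-16
       // char array):
       //   for (int i = 0; i < cnt1; i++) {
       //     if (str1[i] == ch) return i;
       //   }
       //   return -1;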
3284   movptr(result, str1);
3285   if (UseAVX >= 2) {
3286     cmpl(cnt1, stride);
3287     jcc(Assembler::less, SCAN_TO_CHAR);
3288     cmpl(cnt1, 2*stride);
3289     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3290     movdl(vec1, ch);
3291     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3292     vpxor(vec2, vec2);
3293     movl(tmp, cnt1);
3294     andl(tmp, 0xFFFFFFF0);   // vector count (in chars)
3295     andl(cnt1, 0x0000000F);  // tail count (in chars)
3296 
3297     bind(SCAN_TO_16_CHAR_LOOP);
3298     vmovdqu(vec3, Address(result, 0));
3299     vpcmpeqw(vec3, vec3, vec1, 1);
3300     vptest(vec2, vec3);
3301     jcc(Assembler::carryClear, FOUND_CHAR);
3302     addptr(result, 32);
3303     subl(tmp, 2*stride);
3304     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3305     jmp(SCAN_TO_8_CHAR);
3306     bind(SCAN_TO_8_CHAR_INIT);
3307     movdl(vec1, ch);
3308     pshuflw(vec1, vec1, 0x00);
3309     pshufd(vec1, vec1, 0);
3310     pxor(vec2, vec2);
3311   }
3312   bind(SCAN_TO_8_CHAR);
3313   cmpl(cnt1, stride);
3314   jcc(Assembler::less, SCAN_TO_CHAR);
3315   if (UseAVX < 2) {
3316     movdl(vec1, ch);
3317     pshuflw(vec1, vec1, 0x00);
3318     pshufd(vec1, vec1, 0);
3319     pxor(vec2, vec2);
3320   }
3321   movl(tmp, cnt1);
3322   andl(tmp, 0xFFFFFFF8);   // vector count (in chars)
3323   andl(cnt1, 0x00000007);  // tail count (in chars)
3324 
3325   bind(SCAN_TO_8_CHAR_LOOP);
3326   movdqu(vec3, Address(result, 0));
3327   pcmpeqw(vec3, vec1);
3328   ptest(vec2, vec3);
3329   jcc(Assembler::carryClear, FOUND_CHAR);
3330   addptr(result, 16);
3331   subl(tmp, stride);
3332   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3333   bind(SCAN_TO_CHAR);
3334   testl(cnt1, cnt1);
3335   jcc(Assembler::zero, RET_NOT_FOUND);
3336   bind(SCAN_TO_CHAR_LOOP);
3337   load_unsigned_short(tmp, Address(result, 0));
3338   cmpl(ch, tmp);
3339   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3340   addptr(result, 2);
3341   subl(cnt1, 1);
3342   jccb(Assembler::zero, RET_NOT_FOUND);
3343   jmp(SCAN_TO_CHAR_LOOP);
3344 
3345   bind(RET_NOT_FOUND);
3346   movl(result, -1);
3347   jmpb(DONE_LABEL);
3348 
3349   bind(FOUND_CHAR);
3350   if (UseAVX >= 2) {
3351     vpmovmskb(tmp, vec3);
3352   } else {
3353     pmovmskb(tmp, vec3);
3354   }
3355   bsfl(ch, tmp);
3356   addptr(result, ch);
3357 
3358   bind(FOUND_SEQ_CHAR);
3359   subptr(result, str1);
3360   shrl(result, 1);
3361 
3362   bind(DONE_LABEL);
3363 } // string_indexof_char
3364 
3365 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3366                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3367   ShortBranchVerifier sbv(this);
3368   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3369 
3370   int stride = 16;
3371 
3372   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3373         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3374         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3375         FOUND_SEQ_CHAR, DONE_LABEL;
3376 
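       // Latin1 variant of string_indexof_char above: the same sketch
       // applies, but over byte elements (return the first i for which
       // str1[i] == ch, otherwise -1).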
3377   movptr(result, str1);
3378   if (UseAVX >= 2) {
3379     cmpl(cnt1, stride);
3380     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3381     cmpl(cnt1, stride*2);
3382     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3383     movdl(vec1, ch);
3384     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3385     vpxor(vec2, vec2);
3386     movl(tmp, cnt1);
3387     andl(tmp, 0xFFFFFFE0);   // vector count (in chars)
3388     andl(cnt1, 0x0000001F);  // tail count (in chars)
3389 
3390     bind(SCAN_TO_32_CHAR_LOOP);
3391     vmovdqu(vec3, Address(result, 0));
3392     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3393     vptest(vec2, vec3);
3394     jcc(Assembler::carryClear, FOUND_CHAR);
3395     addptr(result, 32);
3396     subl(tmp, stride*2);
3397     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3398     jmp(SCAN_TO_16_CHAR);
3399 
3400     bind(SCAN_TO_16_CHAR_INIT);
3401     movdl(vec1, ch);
3402     pxor(vec2, vec2);
3403     pshufb(vec1, vec2);
3404   }
3405 
3406   bind(SCAN_TO_16_CHAR);
3407   cmpl(cnt1, stride);
3408   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3409   if (UseAVX < 2) {
3410     movdl(vec1, ch);
3411     pxor(vec2, vec2);
3412     pshufb(vec1, vec2);
3413   }
3414   movl(tmp, cnt1);
3415   andl(tmp, 0xFFFFFFF0);   // vector count (in bytes)
3416   andl(cnt1, 0x0000000F);  // tail count (in bytes)
3417 
3418   bind(SCAN_TO_16_CHAR_LOOP);
3419   movdqu(vec3, Address(result, 0));
3420   pcmpeqb(vec3, vec1);
3421   ptest(vec2, vec3);
3422   jcc(Assembler::carryClear, FOUND_CHAR);
3423   addptr(result, 16);
3424   subl(tmp, stride);
3425   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3426 
3427   bind(SCAN_TO_CHAR_INIT);
3428   testl(cnt1, cnt1);
3429   jcc(Assembler::zero, RET_NOT_FOUND);
3430   bind(SCAN_TO_CHAR_LOOP);
3431   load_unsigned_byte(tmp, Address(result, 0));
3432   cmpl(ch, tmp);
3433   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3434   addptr(result, 1);
3435   subl(cnt1, 1);
3436   jccb(Assembler::zero, RET_NOT_FOUND);
3437   jmp(SCAN_TO_CHAR_LOOP);
3438 
3439   bind(RET_NOT_FOUND);
3440   movl(result, -1);
3441   jmpb(DONE_LABEL);
3442 
3443   bind(FOUND_CHAR);
3444   if (UseAVX >= 2) {
3445     vpmovmskb(tmp, vec3);
3446   } else {
3447     pmovmskb(tmp, vec3);
3448   }
3449   bsfl(ch, tmp);
3450   addptr(result, ch);
3451 
3452   bind(FOUND_SEQ_CHAR);
3453   subptr(result, str1);
3454 
3455   bind(DONE_LABEL);
3456 } // stringL_indexof_char
3457 
3458 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3459   switch (eltype) {
3460   case T_BOOLEAN: return sizeof(jboolean);
3461   case T_BYTE:  return sizeof(jbyte);
3462   case T_SHORT: return sizeof(jshort);
3463   case T_CHAR:  return sizeof(jchar);
3464   case T_INT:   return sizeof(jint);
3465   default:
3466     ShouldNotReachHere();
3467     return -1;
3468   }
3469 }
3470 
3471 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3472   switch (eltype) {
3473   // T_BOOLEAN used as surrogate for unsigned byte
3474   case T_BOOLEAN: movzbl(dst, src);   break;
3475   case T_BYTE:    movsbl(dst, src);   break;
3476   case T_SHORT:   movswl(dst, src);   break;
3477   case T_CHAR:    movzwl(dst, src);   break;
3478   case T_INT:     movl(dst, src);     break;
3479   default:
3480     ShouldNotReachHere();
3481   }
3482 }
3483 
3484 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3485   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3486 }
3487 
3488 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3489   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3490 }
3491 
3492 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3493   const int vlen = Assembler::AVX_256bit;
3494   switch (eltype) {
3495   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3496   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3497   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3498   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3499   case T_INT:
3500     // do nothing
3501     break;
3502   default:
3503     ShouldNotReachHere();
3504   }
3505 }
3506 
3507 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3508                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3509                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3510                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3511                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3512                                         BasicType eltype) {
3513   ShortBranchVerifier sbv(this);
3514   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3515   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3516   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3517 
3518   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3519         SHORT_UNROLLED_LOOP_EXIT,
3520         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3521         UNROLLED_VECTOR_LOOP_BEGIN,
3522         END;
3523   switch (eltype) {
3524   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3525   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3526   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3527   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3528   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3529   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3530   }
3531 
3532   // "Renaming" for readability of the code
3533   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3534                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3535                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3536 
3537   const int elsize = arrays_hashcode_elsize(eltype);
3538 
3539   /*
3540     if (cnt1 >= 2) {
3541       if (cnt1 >= 32) {
3542         UNROLLED VECTOR LOOP
3543       }
3544       UNROLLED SCALAR LOOP
3545     }
3546     SINGLE SCALAR
3547    */
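
       /*
         Scalar form of the hash computed here (a sketch; the incoming
         'result' is the initial value, matching the usual 31-polynomial):
           for (int i = 0; i < cnt1; i++) {
             result = 31 * result + ary1[i];
           }
        */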
3548 
3549   cmpl(cnt1, 32);
3550   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3551 
3552   // cnt1 >= 32 && generate_vectorized_loop
3553   xorl(index, index);
3554 
3555   // vresult = IntVector.zero(I256);
3556   for (int idx = 0; idx < 4; idx++) {
3557     vpxor(vresult[idx], vresult[idx]);
3558   }
3559   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3560   Register bound = tmp2;
3561   Register next = tmp3;
3562   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3563   movl(next, Address(tmp2, 0));
3564   movdl(vnext, next);
3565   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3566 
3567   // index = 0;
3568   // bound = cnt1 & ~(32 - 1);
3569   movl(bound, cnt1);
3570   andl(bound, ~(32 - 1));
3571   // for (; index < bound; index += 32) {
3572   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3573   // result *= next;
3574   imull(result, next);
3575   // Loop fission to front-load the cost of fetching from memory; OOO execution
3576   // can then hopefully do a better job of prefetching.
3577   for (int idx = 0; idx < 4; idx++) {
3578     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3579   }
3580   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3581   for (int idx = 0; idx < 4; idx++) {
3582     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3583     arrays_hashcode_elvcast(vtmp[idx], eltype);
3584     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3585   }
3586   // index += 32;
3587   addl(index, 32);
3588   // index < bound;
3589   cmpl(index, bound);
3590   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3591   // }
3592 
3593   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3594   subl(cnt1, bound);
3595   // release bound
3596 
3597   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3598   for (int idx = 0; idx < 4; idx++) {
3599     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3600     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3601     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3602   }
3603   // result += vresult.reduceLanes(ADD);
3604   for (int idx = 0; idx < 4; idx++) {
3605     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3606   }
3607 
3608   // } else if (cnt1 < 32) {
3609 
3610   bind(SHORT_UNROLLED_BEGIN);
3611   // int i = 1;
3612   movl(index, 1);
3613   cmpl(index, cnt1);
3614   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3615 
3616   // for (; i < cnt1 ; i += 2) {
3617   bind(SHORT_UNROLLED_LOOP_BEGIN);
3618   movl(tmp3, 961);
3619   imull(result, tmp3);
3620   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3621   movl(tmp3, tmp2);
3622   shll(tmp3, 5);
3623   subl(tmp3, tmp2);
3624   addl(result, tmp3);
3625   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3626   addl(result, tmp3);
3627   addl(index, 2);
3628   cmpl(index, cnt1);
3629   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3630 
3631   // }
3632   // if (i >= cnt1) {
3633   bind(SHORT_UNROLLED_LOOP_EXIT);
3634   jccb(Assembler::greater, END);
3635   movl(tmp2, result);
3636   shll(result, 5);
3637   subl(result, tmp2);
3638   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3639   addl(result, tmp3);
3640   // }
3641   bind(END);
3642 
3643   BLOCK_COMMENT("} // arrays_hashcode");
3644 
3645 } // arrays_hashcode
3646 
3647 // helper function for string_compare
3648 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3649                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3650                                            Address::ScaleFactor scale2, Register index, int ae) {
3651   if (ae == StrIntrinsicNode::LL) {
3652     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3653     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3654   } else if (ae == StrIntrinsicNode::UU) {
3655     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3656     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3657   } else {
3658     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3659     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3660   }
3661 }
3662 
3663 // Compare strings, used for char[] and byte[].
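     // Conceptual scalar equivalent (a sketch only; element widths depend on
     // 'ae', and the UL case negates the result at the end):
     //   int lim = min(len1, len2);   // lengths in characters
     //   for (int i = 0; i < lim; i++) {
     //     if (str1[i] != str2[i]) return str1[i] - str2[i];
     //   }
     //   return len1 - len2;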
3664 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3665                                        Register cnt1, Register cnt2, Register result,
3666                                        XMMRegister vec1, int ae, KRegister mask) {
3667   ShortBranchVerifier sbv(this);
3668   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3669   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3670   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3671   int stride2x2 = 0x40;
3672   Address::ScaleFactor scale = Address::no_scale;
3673   Address::ScaleFactor scale1 = Address::no_scale;
3674   Address::ScaleFactor scale2 = Address::no_scale;
3675 
3676   if (ae != StrIntrinsicNode::LL) {
3677     stride2x2 = 0x20;
3678   }
3679 
3680   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3681     shrl(cnt2, 1);
3682   }
3683   // Compute the minimum of the string lengths and the
3684   // difference of the string lengths (stack).
3685   // Do the conditional move stuff
3686   movl(result, cnt1);
3687   subl(cnt1, cnt2);
3688   push(cnt1);
3689   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3690 
3691   // Is the minimum length zero?
3692   testl(cnt2, cnt2);
3693   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3694   if (ae == StrIntrinsicNode::LL) {
3695     // Load first bytes
3696     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3697     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3698   } else if (ae == StrIntrinsicNode::UU) {
3699     // Load first characters
3700     load_unsigned_short(result, Address(str1, 0));
3701     load_unsigned_short(cnt1, Address(str2, 0));
3702   } else {
3703     load_unsigned_byte(result, Address(str1, 0));
3704     load_unsigned_short(cnt1, Address(str2, 0));
3705   }
3706   subl(result, cnt1);
3707   jcc(Assembler::notZero,  POP_LABEL);
3708 
3709   if (ae == StrIntrinsicNode::UU) {
3710     // Divide length by 2 to get number of chars
3711     shrl(cnt2, 1);
3712   }
3713   cmpl(cnt2, 1);
3714   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3715 
3716   // Check if the strings start at the same location and setup scale and stride
3717   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3718     cmpptr(str1, str2);
3719     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3720     if (ae == StrIntrinsicNode::LL) {
3721       scale = Address::times_1;
3722       stride = 16;
3723     } else {
3724       scale = Address::times_2;
3725       stride = 8;
3726     }
3727   } else {
3728     scale1 = Address::times_1;
3729     scale2 = Address::times_2;
3730     // scale not used
3731     stride = 8;
3732   }
3733 
3734   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3735     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3736     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3737     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3738     Label COMPARE_TAIL_LONG;
3739     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3740 
3741     int pcmpmask = 0x19;
3742     if (ae == StrIntrinsicNode::LL) {
3743       pcmpmask &= ~0x01;
3744     }
3745 
3746     // Set up to compare 16-char (32-byte) vectors,
3747     // starting from the first character again because it has an aligned address.
3748     if (ae == StrIntrinsicNode::LL) {
3749       stride2 = 32;
3750     } else {
3751       stride2 = 16;
3752     }
3753     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3754       adr_stride = stride << scale;
3755     } else {
3756       adr_stride1 = 8;  //stride << scale1;
3757       adr_stride2 = 16; //stride << scale2;
3758     }
3759 
3760     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3761     // rax and rdx are used by pcmpestri as elements counters
3762     movl(result, cnt2);
3763     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3764     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3765 
3766     // fast path : compare first 2 8-char vectors.
3767     bind(COMPARE_16_CHARS);
3768     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3769       movdqu(vec1, Address(str1, 0));
3770     } else {
3771       pmovzxbw(vec1, Address(str1, 0));
3772     }
3773     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3774     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3775 
3776     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3777       movdqu(vec1, Address(str1, adr_stride));
3778       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3779     } else {
3780       pmovzxbw(vec1, Address(str1, adr_stride1));
3781       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3782     }
3783     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3784     addl(cnt1, stride);
3785 
3786     // Compare the characters at index in cnt1
3787     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3788     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3789     subl(result, cnt2);
3790     jmp(POP_LABEL);
3791 
3792     // Setup the registers to start vector comparison loop
3793     bind(COMPARE_WIDE_VECTORS);
3794     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3795       lea(str1, Address(str1, result, scale));
3796       lea(str2, Address(str2, result, scale));
3797     } else {
3798       lea(str1, Address(str1, result, scale1));
3799       lea(str2, Address(str2, result, scale2));
3800     }
3801     subl(result, stride2);
3802     subl(cnt2, stride2);
3803     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3804     negptr(result);
3805 
3806     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3807     bind(COMPARE_WIDE_VECTORS_LOOP);
3808 
3809 #ifdef _LP64
3810     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3811       cmpl(cnt2, stride2x2);
3812       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3813       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3814       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3815 
3816       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3817       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3818         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3819         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3820       } else {
3821         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3822         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3823       }
3824       kortestql(mask, mask);
3825       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3826       addptr(result, stride2x2);  // update since we already compared at this addr
3827       subl(cnt2, stride2x2);      // and sub the size too
3828       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3829 
3830       vpxor(vec1, vec1);
3831       jmpb(COMPARE_WIDE_TAIL);
3832     }//if (VM_Version::supports_avx512vlbw())
3833 #endif // _LP64
3834 
3835 
3836     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3837     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838       vmovdqu(vec1, Address(str1, result, scale));
3839       vpxor(vec1, Address(str2, result, scale));
3840     } else {
3841       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3842       vpxor(vec1, Address(str2, result, scale2));
3843     }
3844     vptest(vec1, vec1);
3845     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3846     addptr(result, stride2);
3847     subl(cnt2, stride2);
3848     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3849     // clean upper bits of YMM registers
3850     vpxor(vec1, vec1);
3851 
3852     // compare wide vectors tail
3853     bind(COMPARE_WIDE_TAIL);
3854     testptr(result, result);
3855     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3856 
3857     movl(result, stride2);
3858     movl(cnt2, result);
3859     negptr(result);
3860     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3861 
3862     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3863     bind(VECTOR_NOT_EQUAL);
3864     // clean upper bits of YMM registers
3865     vpxor(vec1, vec1);
3866     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3867       lea(str1, Address(str1, result, scale));
3868       lea(str2, Address(str2, result, scale));
3869     } else {
3870       lea(str1, Address(str1, result, scale1));
3871       lea(str2, Address(str2, result, scale2));
3872     }
3873     jmp(COMPARE_16_CHARS);
3874 
3875     // Compare tail chars, length between 1 and 15 chars
3876     bind(COMPARE_TAIL_LONG);
3877     movl(cnt2, result);
3878     cmpl(cnt2, stride);
3879     jcc(Assembler::less, COMPARE_SMALL_STR);
3880 
3881     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3882       movdqu(vec1, Address(str1, 0));
3883     } else {
3884       pmovzxbw(vec1, Address(str1, 0));
3885     }
3886     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3887     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3888     subptr(cnt2, stride);
3889     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3890     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3891       lea(str1, Address(str1, result, scale));
3892       lea(str2, Address(str2, result, scale));
3893     } else {
3894       lea(str1, Address(str1, result, scale1));
3895       lea(str2, Address(str2, result, scale2));
3896     }
3897     negptr(cnt2);
3898     jmpb(WHILE_HEAD_LABEL);
3899 
3900     bind(COMPARE_SMALL_STR);
3901   } else if (UseSSE42Intrinsics) {
3902     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3903     int pcmpmask = 0x19;
3904     // Set up to compare 8-char (16-byte) vectors,
3905     // starting from the first character again because it has an aligned address.
3906     movl(result, cnt2);
3907     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3908     if (ae == StrIntrinsicNode::LL) {
3909       pcmpmask &= ~0x01;
3910     }
3911     jcc(Assembler::zero, COMPARE_TAIL);
3912     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3913       lea(str1, Address(str1, result, scale));
3914       lea(str2, Address(str2, result, scale));
3915     } else {
3916       lea(str1, Address(str1, result, scale1));
3917       lea(str2, Address(str2, result, scale2));
3918     }
3919     negptr(result);
3920 
3921     // pcmpestri
3922     //   inputs:
3923     //     vec1- substring
3924     //     rax - negative string length (elements count)
3925     //     mem - scanned string
3926     //     rdx - string length (elements count)
3927     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3928     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3929     //   outputs:
3930     //     rcx - first mismatched element index
3931     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3932 
3933     bind(COMPARE_WIDE_VECTORS);
3934     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3935       movdqu(vec1, Address(str1, result, scale));
3936       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3937     } else {
3938       pmovzxbw(vec1, Address(str1, result, scale1));
3939       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3940     }
3941     // After pcmpestri cnt1(rcx) contains mismatched element index
3942 
3943     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3944     addptr(result, stride);
3945     subptr(cnt2, stride);
3946     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3947 
3948     // compare wide vectors tail
3949     testptr(result, result);
3950     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3951 
3952     movl(cnt2, stride);
3953     movl(result, stride);
3954     negptr(result);
3955     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3956       movdqu(vec1, Address(str1, result, scale));
3957       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3958     } else {
3959       pmovzxbw(vec1, Address(str1, result, scale1));
3960       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3961     }
3962     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3963 
3964     // Mismatched characters in the vectors
3965     bind(VECTOR_NOT_EQUAL);
3966     addptr(cnt1, result);
3967     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3968     subl(result, cnt2);
3969     jmpb(POP_LABEL);
3970 
3971     bind(COMPARE_TAIL); // limit is zero
3972     movl(cnt2, result);
3973     // Fallthru to tail compare
3974   }
3975   // Shift str2 and str1 to the end of the arrays, negate min
3976   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3977     lea(str1, Address(str1, cnt2, scale));
3978     lea(str2, Address(str2, cnt2, scale));
3979   } else {
3980     lea(str1, Address(str1, cnt2, scale1));
3981     lea(str2, Address(str2, cnt2, scale2));
3982   }
3983   decrementl(cnt2);  // first character was compared already
3984   negptr(cnt2);
3985 
3986   // Compare the rest of the elements
3987   bind(WHILE_HEAD_LABEL);
3988   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3989   subl(result, cnt1);
3990   jccb(Assembler::notZero, POP_LABEL);
3991   increment(cnt2);
3992   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3993 
3994   // Strings are equal up to min length.  Return the length difference.
3995   bind(LENGTH_DIFF_LABEL);
3996   pop(result);
3997   if (ae == StrIntrinsicNode::UU) {
3998     // Divide diff by 2 to get number of chars
3999     sarl(result, 1);
4000   }
4001   jmpb(DONE_LABEL);
4002 
4003 #ifdef _LP64
4004   if (VM_Version::supports_avx512vlbw()) {
4005 
4006     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4007 
4008     kmovql(cnt1, mask);
4009     notq(cnt1);
4010     bsfq(cnt2, cnt1);
4011     if (ae != StrIntrinsicNode::LL) {
4012       // Divide diff by 2 to get number of chars
4013       sarl(cnt2, 1);
4014     }
4015     addq(result, cnt2);
4016     if (ae == StrIntrinsicNode::LL) {
4017       load_unsigned_byte(cnt1, Address(str2, result));
4018       load_unsigned_byte(result, Address(str1, result));
4019     } else if (ae == StrIntrinsicNode::UU) {
4020       load_unsigned_short(cnt1, Address(str2, result, scale));
4021       load_unsigned_short(result, Address(str1, result, scale));
4022     } else {
4023       load_unsigned_short(cnt1, Address(str2, result, scale2));
4024       load_unsigned_byte(result, Address(str1, result, scale1));
4025     }
4026     subl(result, cnt1);
4027     jmpb(POP_LABEL);
4028   }//if (VM_Version::supports_avx512vlbw())
4029 #endif // _LP64
4030 
4031   // Discard the stored length difference
4032   bind(POP_LABEL);
4033   pop(cnt1);
4034 
4035   // That's it
4036   bind(DONE_LABEL);
4037   if(ae == StrIntrinsicNode::UL) {
4038     negl(result);
4039   }
4040 
4041 }
4042 
4043 // Search for a non-ASCII character (negative byte value) in a byte array;
4044 // return the index of the first such character, otherwise the length
4045 // of the array segment searched.
4046 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4047 //   @IntrinsicCandidate
4048 //   public static int countPositives(byte[] ba, int off, int len) {
4049 //     for (int i = off; i < off + len; i++) {
4050 //       if (ba[i] < 0) {
4051 //         return i - off;
4052 //       }
4053 //     }
4054 //     return len;
4055 //   }
4056 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4057   Register result, Register tmp1,
4058   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4059   // rsi: byte array
4060   // rcx: len
4061   // rax: result
4062   ShortBranchVerifier sbv(this);
4063   assert_different_registers(ary1, len, result, tmp1);
4064   assert_different_registers(vec1, vec2);
4065   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4066 
4067   movl(result, len); // copy
4068   // len == 0
4069   testl(len, len);
4070   jcc(Assembler::zero, DONE);
4071 
4072   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4073     VM_Version::supports_avx512vlbw() &&
4074     VM_Version::supports_bmi2()) {
4075 
4076     Label test_64_loop, test_tail, BREAK_LOOP;
4077     Register tmp3_aliased = len;
4078 
4079     movl(tmp1, len);
4080     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4081 
4082     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
4083     andl(len, ~(64 - 1));    // vector count (in chars)
4084     jccb(Assembler::zero, test_tail);
4085 
4086     lea(ary1, Address(ary1, len, Address::times_1));
4087     negptr(len);
4088 
4089     bind(test_64_loop);
4090     // Check whether these 64 byte-sized elements contain negatives
4091     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4092     kortestql(mask1, mask1);
4093     jcc(Assembler::notZero, BREAK_LOOP);
4094 
4095     addptr(len, 64);
4096     jccb(Assembler::notZero, test_64_loop);
4097 
4098     bind(test_tail);
4099     // bail out when there is nothing to be done
4100     testl(tmp1, -1);
4101     jcc(Assembler::zero, DONE);
4102 
4103     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4104 #ifdef _LP64
4105     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4106     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4107     notq(tmp3_aliased);
4108     kmovql(mask2, tmp3_aliased);
4109 #else
4110     Label k_init;
4111     jmp(k_init);
4112 
4113     // We cannot read 64 bits from a general purpose register, so we move the
4114     // data required to compose 64 1's into the instruction stream.
4115     // We emit a 64-byte-wide series of elements 0..63 which is later used as
4116     // the compare target against the tail count contained in the tmp1 register.
4117     // The result is a k register holding tmp1 consecutive 1's counting from
4118     // the least significant bit.
4119     address tmp = pc();
4120     emit_int64(0x0706050403020100);
4121     emit_int64(0x0F0E0D0C0B0A0908);
4122     emit_int64(0x1716151413121110);
4123     emit_int64(0x1F1E1D1C1B1A1918);
4124     emit_int64(0x2726252423222120);
4125     emit_int64(0x2F2E2D2C2B2A2928);
4126     emit_int64(0x3736353433323130);
4127     emit_int64(0x3F3E3D3C3B3A3938);
4128 
4129     bind(k_init);
4130     lea(len, InternalAddress(tmp));
    // Create the tail mask in mask2 (a k register with the low tmp1 bits set)
    // to limit the negative-byte test below to the tail elements
4132     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4133     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4134 
4135 #endif
4136     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4137     ktestq(mask1, mask2);
4138     jcc(Assembler::zero, DONE);
4139 
4140     bind(BREAK_LOOP);
4141     // At least one byte in the last 64 bytes is negative.
4142     // Set up to look at the last 64 bytes as if they were a tail
4143     lea(ary1, Address(ary1, len, Address::times_1));
4144     addptr(result, len);
    // Ignore the very last byte: we already know this block contains a negative
    // byte, so if all the others are positive it must be the last one, and we
    // can skip right to the 2+1 byte end comparison at this point
4148     orl(result, 63);
4149     movl(len, 63);
4150     // Fallthru to tail compare
4151   } else {
4152 
4153     if (UseAVX >= 2 && UseSSE >= 2) {
4154       // With AVX2, use 32-byte vector compare
4155       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4156 
4157       // Compare 32-byte vectors
4158       testl(len, 0xffffffe0);   // vector count (in bytes)
4159       jccb(Assembler::zero, TAIL_START);
4160 
4161       andl(len, 0xffffffe0);
4162       lea(ary1, Address(ary1, len, Address::times_1));
4163       negptr(len);
4164 
      movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4166       movdl(vec2, tmp1);
4167       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4168 
4169       bind(COMPARE_WIDE_VECTORS);
4170       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4171       vptest(vec1, vec2);
4172       jccb(Assembler::notZero, BREAK_LOOP);
4173       addptr(len, 32);
4174       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4175 
4176       testl(result, 0x0000001f);   // any bytes remaining?
4177       jcc(Assembler::zero, DONE);
4178 
4179       // Quick test using the already prepared vector mask
4180       movl(len, result);
4181       andl(len, 0x0000001f);
4182       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4183       vptest(vec1, vec2);
4184       jcc(Assembler::zero, DONE);
      // There are negative bytes, jump to the tail to determine exactly where
4186       jmpb(TAIL_START);
4187 
4188       bind(BREAK_LOOP);
4189       // At least one byte in the last 32-byte vector is negative.
4190       // Set up to look at the last 32 bytes as if they were a tail
4191       lea(ary1, Address(ary1, len, Address::times_1));
4192       addptr(result, len);
      // Ignore the very last byte: we already know this block contains a negative
      // byte, so if all the others are positive it must be the last one, and we
      // can skip right to the 2+1 byte end comparison at this point
4196       orl(result, 31);
4197       movl(len, 31);
4198       // Fallthru to tail compare
4199     } else if (UseSSE42Intrinsics) {
4200       // With SSE4.2, use double quad vector compare
4201       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4202 
4203       // Compare 16-byte vectors
4204       testl(len, 0xfffffff0);   // vector count (in bytes)
4205       jcc(Assembler::zero, TAIL_START);
4206 
4207       andl(len, 0xfffffff0);
4208       lea(ary1, Address(ary1, len, Address::times_1));
4209       negptr(len);
4210 
4211       movl(tmp1, 0x80808080);
4212       movdl(vec2, tmp1);
4213       pshufd(vec2, vec2, 0);
4214 
4215       bind(COMPARE_WIDE_VECTORS);
4216       movdqu(vec1, Address(ary1, len, Address::times_1));
4217       ptest(vec1, vec2);
4218       jccb(Assembler::notZero, BREAK_LOOP);
4219       addptr(len, 16);
4220       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4221 
4222       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4223       jcc(Assembler::zero, DONE);
4224 
4225       // Quick test using the already prepared vector mask
4226       movl(len, result);
4227       andl(len, 0x0000000f);   // tail count (in bytes)
4228       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4229       ptest(vec1, vec2);
4230       jcc(Assembler::zero, DONE);
4231       jmpb(TAIL_START);
4232 
4233       bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up to look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: we already know this block contains a negative
      // byte, so if all the others are positive it must be the last one, and we
      // can skip right to the 2+1 byte end comparison at this point
4241       orl(result, 15);
4242       movl(len, 15);
4243       // Fallthru to tail compare
4244     }
4245   }
4246 
4247   bind(TAIL_START);
4248   // Compare 4-byte vectors
4249   andl(len, 0xfffffffc); // vector count (in bytes)
4250   jccb(Assembler::zero, COMPARE_CHAR);
4251 
4252   lea(ary1, Address(ary1, len, Address::times_1));
4253   negptr(len);
4254 
4255   bind(COMPARE_VECTORS);
4256   movl(tmp1, Address(ary1, len, Address::times_1));
4257   andl(tmp1, 0x80808080);
4258   jccb(Assembler::notZero, TAIL_ADJUST);
4259   addptr(len, 4);
4260   jccb(Assembler::notZero, COMPARE_VECTORS);
4261 
4262   // Compare trailing char (final 2-3 bytes), if any
4263   bind(COMPARE_CHAR);
4264 
4265   testl(result, 0x2);   // tail  char
4266   jccb(Assembler::zero, COMPARE_BYTE);
4267   load_unsigned_short(tmp1, Address(ary1, 0));
4268   andl(tmp1, 0x00008080);
4269   jccb(Assembler::notZero, CHAR_ADJUST);
4270   lea(ary1, Address(ary1, 2));
4271 
4272   bind(COMPARE_BYTE);
4273   testl(result, 0x1);   // tail  byte
4274   jccb(Assembler::zero, DONE);
4275   load_unsigned_byte(tmp1, Address(ary1, 0));
4276   testl(tmp1, 0x00000080);
4277   jccb(Assembler::zero, DONE);
4278   subptr(result, 1);
4279   jmpb(DONE);
4280 
4281   bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
  // Adjust result and check the next three bytes
4284   addptr(result, len);
4285   orl(result, 3);
4286   lea(ary1, Address(ary1, len, Address::times_1));
4287   jmpb(COMPARE_CHAR);
4288 
4289   bind(CHAR_ADJUST);
4290   // We are looking at a char + optional byte tail, and found that one
4291   // of the bytes in the char is negative. Adjust the result, check the
4292   // first byte and readjust if needed.
4293   andl(result, 0xfffffffc);
4294   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4295   jccb(Assembler::notZero, DONE);
4296   addptr(result, 1);
4297 
4298   // That's it
4299   bind(DONE);
4300   if (UseAVX >= 2 && UseSSE >= 2) {
4301     // clean upper bits of YMM registers
4302     vpxor(vec1, vec1);
4303     vpxor(vec2, vec2);
4304   }
4305 }
4306 
4307 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4308 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4309                                       Register limit, Register result, Register chr,
4310                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4311   ShortBranchVerifier sbv(this);
4312   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4313 
4314   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4315   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4316 
4317   if (is_array_equ) {
4318     // Check the input args
4319     cmpoop(ary1, ary2);
4320     jcc(Assembler::equal, TRUE_LABEL);
4321 
4322     // Need additional checks for arrays_equals.
4323     testptr(ary1, ary1);
4324     jcc(Assembler::zero, FALSE_LABEL);
4325     testptr(ary2, ary2);
4326     jcc(Assembler::zero, FALSE_LABEL);
4327 
4328     // Check the lengths
4329     movl(limit, Address(ary1, length_offset));
4330     cmpl(limit, Address(ary2, length_offset));
4331     jcc(Assembler::notEqual, FALSE_LABEL);
4332   }
4333 
4334   // count == 0
4335   testl(limit, limit);
4336   jcc(Assembler::zero, TRUE_LABEL);
4337 
4338   if (is_array_equ) {
4339     // Load array address
4340     lea(ary1, Address(ary1, base_offset));
4341     lea(ary2, Address(ary2, base_offset));
4342   }
4343 
4344   if (is_array_equ && is_char) {
4345     // arrays_equals when used for char[].
4346     shll(limit, 1);      // byte count != 0
4347   }
4348   movl(result, limit); // copy
4349 
4350   if (UseAVX >= 2) {
4351     // With AVX2, use 32-byte vector compare
4352     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4353 
4354     // Compare 32-byte vectors
4355     andl(result, 0x0000001f);  //   tail count (in bytes)
4356     andl(limit, 0xffffffe0);   // vector count (in bytes)
4357     jcc(Assembler::zero, COMPARE_TAIL);
4358 
4359     lea(ary1, Address(ary1, limit, Address::times_1));
4360     lea(ary2, Address(ary2, limit, Address::times_1));
4361     negptr(limit);
4362 
4363 #ifdef _LP64
4364     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4365       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4366 
4367       cmpl(limit, -64);
4368       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4369 
4370       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4371 
4372       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4373       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4374       kortestql(mask, mask);
4375       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4376       addptr(limit, 64);  // update since we already compared at this addr
4377       cmpl(limit, -64);
4378       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4379 
4380       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4382       //  cmpl(limit, 0);
4383       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4384       // But since we stopped at the points ary{1,2}+limit which are
4385       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4386       // (|limit| <= 32 and result < 32),
4387       // we may just compare the last 64 bytes.
4388       //
4389       addptr(result, -64);   // it is safe, bc we just came from this area
4390       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4391       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4392       kortestql(mask, mask);
4393       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4394 
4395       jmp(TRUE_LABEL);
4396 
4397       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4398 
    } // if (VM_Version::supports_avx512vlbw())
4400 #endif //_LP64
4401     bind(COMPARE_WIDE_VECTORS);
4402     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4403     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4404     vpxor(vec1, vec2);
4405 
4406     vptest(vec1, vec1);
4407     jcc(Assembler::notZero, FALSE_LABEL);
4408     addptr(limit, 32);
4409     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4410 
4411     testl(result, result);
4412     jcc(Assembler::zero, TRUE_LABEL);
4413 
4414     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4415     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4416     vpxor(vec1, vec2);
4417 
4418     vptest(vec1, vec1);
4419     jccb(Assembler::notZero, FALSE_LABEL);
4420     jmpb(TRUE_LABEL);
4421 
4422     bind(COMPARE_TAIL); // limit is zero
4423     movl(limit, result);
4424     // Fallthru to tail compare
4425   } else if (UseSSE42Intrinsics) {
4426     // With SSE4.2, use double quad vector compare
4427     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4428 
4429     // Compare 16-byte vectors
4430     andl(result, 0x0000000f);  //   tail count (in bytes)
4431     andl(limit, 0xfffffff0);   // vector count (in bytes)
4432     jcc(Assembler::zero, COMPARE_TAIL);
4433 
4434     lea(ary1, Address(ary1, limit, Address::times_1));
4435     lea(ary2, Address(ary2, limit, Address::times_1));
4436     negptr(limit);
4437 
4438     bind(COMPARE_WIDE_VECTORS);
4439     movdqu(vec1, Address(ary1, limit, Address::times_1));
4440     movdqu(vec2, Address(ary2, limit, Address::times_1));
4441     pxor(vec1, vec2);
4442 
4443     ptest(vec1, vec1);
4444     jcc(Assembler::notZero, FALSE_LABEL);
4445     addptr(limit, 16);
4446     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4447 
4448     testl(result, result);
4449     jcc(Assembler::zero, TRUE_LABEL);
4450 
4451     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4452     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4453     pxor(vec1, vec2);
4454 
4455     ptest(vec1, vec1);
4456     jccb(Assembler::notZero, FALSE_LABEL);
4457     jmpb(TRUE_LABEL);
4458 
4459     bind(COMPARE_TAIL); // limit is zero
4460     movl(limit, result);
4461     // Fallthru to tail compare
4462   }
4463 
4464   // Compare 4-byte vectors
4465   andl(limit, 0xfffffffc); // vector count (in bytes)
4466   jccb(Assembler::zero, COMPARE_CHAR);
4467 
4468   lea(ary1, Address(ary1, limit, Address::times_1));
4469   lea(ary2, Address(ary2, limit, Address::times_1));
4470   negptr(limit);
4471 
4472   bind(COMPARE_VECTORS);
4473   movl(chr, Address(ary1, limit, Address::times_1));
4474   cmpl(chr, Address(ary2, limit, Address::times_1));
4475   jccb(Assembler::notEqual, FALSE_LABEL);
4476   addptr(limit, 4);
4477   jcc(Assembler::notZero, COMPARE_VECTORS);
4478 
4479   // Compare trailing char (final 2 bytes), if any
4480   bind(COMPARE_CHAR);
4481   testl(result, 0x2);   // tail  char
4482   jccb(Assembler::zero, COMPARE_BYTE);
4483   load_unsigned_short(chr, Address(ary1, 0));
4484   load_unsigned_short(limit, Address(ary2, 0));
4485   cmpl(chr, limit);
4486   jccb(Assembler::notEqual, FALSE_LABEL);
4487 
4488   if (is_array_equ && is_char) {
4489     bind(COMPARE_BYTE);
4490   } else {
4491     lea(ary1, Address(ary1, 2));
4492     lea(ary2, Address(ary2, 2));
4493 
4494     bind(COMPARE_BYTE);
4495     testl(result, 0x1);   // tail  byte
4496     jccb(Assembler::zero, TRUE_LABEL);
4497     load_unsigned_byte(chr, Address(ary1, 0));
4498     load_unsigned_byte(limit, Address(ary2, 0));
4499     cmpl(chr, limit);
4500     jccb(Assembler::notEqual, FALSE_LABEL);
4501   }
4502   bind(TRUE_LABEL);
4503   movl(result, 1);   // return true
4504   jmpb(DONE);
4505 
4506   bind(FALSE_LABEL);
4507   xorl(result, result); // return false
4508 
4509   // That's it
4510   bind(DONE);
4511   if (UseAVX >= 2) {
4512     // clean upper bits of YMM registers
4513     vpxor(vec1, vec1);
4514     vpxor(vec2, vec2);
4515   }
4516 }
4517 
4518 #ifdef _LP64
4519 
4520 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4521 #define __ masm.
4522   Register dst = stub.data<0>();
4523   XMMRegister src = stub.data<1>();
4524   address target = stub.data<2>();
4525   __ bind(stub.entry());
4526   __ subptr(rsp, 8);
4527   __ movdbl(Address(rsp), src);
4528   __ call(RuntimeAddress(target));
4529   __ pop(dst);
4530   __ jmp(stub.continuation());
4531 #undef __
4532 }
4533 
4534 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4535   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4536   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4537 
4538   address slowpath_target;
4539   if (dst_bt == T_INT) {
4540     if (src_bt == T_FLOAT) {
4541       cvttss2sil(dst, src);
4542       cmpl(dst, 0x80000000);
4543       slowpath_target = StubRoutines::x86::f2i_fixup();
4544     } else {
4545       cvttsd2sil(dst, src);
4546       cmpl(dst, 0x80000000);
4547       slowpath_target = StubRoutines::x86::d2i_fixup();
4548     }
4549   } else {
4550     if (src_bt == T_FLOAT) {
4551       cvttss2siq(dst, src);
4552       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4553       slowpath_target = StubRoutines::x86::f2l_fixup();
4554     } else {
4555       cvttsd2siq(dst, src);
4556       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4557       slowpath_target = StubRoutines::x86::d2l_fixup();
4558     }
4559   }
4560 
4561   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4562   jcc(Assembler::equal, stub->entry());
4563   bind(stub->continuation());
4564 }
4565 
4566 #endif // _LP64
4567 
4568 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4569                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4570   switch(ideal_opc) {
4571     case Op_LShiftVS:
4572       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4573     case Op_LShiftVI:
4574       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4575     case Op_LShiftVL:
4576       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4577     case Op_RShiftVS:
4578       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4579     case Op_RShiftVI:
4580       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4581     case Op_RShiftVL:
4582       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4583     case Op_URShiftVS:
4584       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4585     case Op_URShiftVI:
4586       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4587     case Op_URShiftVL:
4588       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4589     case Op_RotateRightV:
4590       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4591     case Op_RotateLeftV:
4592       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4593     default:
4594       fatal("Unsupported masked operation"); break;
4595   }
4596 }
4597 
4598 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4599                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4600                                     bool is_varshift) {
4601   switch (ideal_opc) {
4602     case Op_AddVB:
4603       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4604     case Op_AddVS:
4605       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4606     case Op_AddVI:
4607       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4608     case Op_AddVL:
4609       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4610     case Op_AddVF:
4611       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4612     case Op_AddVD:
4613       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4614     case Op_SubVB:
4615       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4616     case Op_SubVS:
4617       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4618     case Op_SubVI:
4619       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4620     case Op_SubVL:
4621       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4622     case Op_SubVF:
4623       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4624     case Op_SubVD:
4625       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4626     case Op_MulVS:
4627       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4628     case Op_MulVI:
4629       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4630     case Op_MulVL:
4631       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4632     case Op_MulVF:
4633       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4634     case Op_MulVD:
4635       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4636     case Op_DivVF:
4637       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4638     case Op_DivVD:
4639       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4640     case Op_SqrtVF:
4641       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4642     case Op_SqrtVD:
4643       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4644     case Op_AbsVB:
4645       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4646     case Op_AbsVS:
4647       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4648     case Op_AbsVI:
4649       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4650     case Op_AbsVL:
4651       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4652     case Op_FmaVF:
4653       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4654     case Op_FmaVD:
4655       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4656     case Op_VectorRearrange:
4657       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4658     case Op_LShiftVS:
4659       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4660     case Op_LShiftVI:
4661       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4662     case Op_LShiftVL:
4663       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4664     case Op_RShiftVS:
4665       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4666     case Op_RShiftVI:
4667       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4668     case Op_RShiftVL:
4669       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4670     case Op_URShiftVS:
4671       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4672     case Op_URShiftVI:
4673       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4674     case Op_URShiftVL:
4675       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4676     case Op_RotateLeftV:
4677       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4678     case Op_RotateRightV:
4679       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4680     case Op_MaxV:
4681       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4682     case Op_MinV:
4683       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4684     case Op_XorV:
4685       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4686     case Op_OrV:
4687       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4688     case Op_AndV:
4689       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4690     default:
4691       fatal("Unsupported masked operation"); break;
4692   }
4693 }
4694 
4695 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4696                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4697   switch (ideal_opc) {
4698     case Op_AddVB:
4699       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_AddVS:
4701       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_AddVI:
4703       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4704     case Op_AddVL:
4705       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4706     case Op_AddVF:
4707       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4708     case Op_AddVD:
4709       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4710     case Op_SubVB:
4711       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_SubVS:
4713       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_SubVI:
4715       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4716     case Op_SubVL:
4717       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4718     case Op_SubVF:
4719       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4720     case Op_SubVD:
4721       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_MulVS:
4723       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4724     case Op_MulVI:
4725       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4726     case Op_MulVL:
4727       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4728     case Op_MulVF:
4729       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4730     case Op_MulVD:
4731       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4732     case Op_DivVF:
4733       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4734     case Op_DivVD:
4735       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4736     case Op_FmaVF:
4737       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4738     case Op_FmaVD:
4739       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_MaxV:
4741       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4742     case Op_MinV:
4743       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4744     case Op_XorV:
4745       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4746     case Op_OrV:
4747       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4748     case Op_AndV:
4749       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4750     default:
4751       fatal("Unsupported masked operation"); break;
4752   }
4753 }
4754 
4755 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4756                                   KRegister src1, KRegister src2) {
4757   BasicType etype = T_ILLEGAL;
4758   switch(mask_len) {
4759     case 2:
4760     case 4:
4761     case 8:  etype = T_BYTE; break;
4762     case 16: etype = T_SHORT; break;
4763     case 32: etype = T_INT; break;
4764     case 64: etype = T_LONG; break;
4765     default: fatal("Unsupported type"); break;
4766   }
4767   assert(etype != T_ILLEGAL, "");
4768   switch(ideal_opc) {
4769     case Op_AndVMask:
4770       kand(etype, dst, src1, src2); break;
4771     case Op_OrVMask:
4772       kor(etype, dst, src1, src2); break;
4773     case Op_XorVMask:
4774       kxor(etype, dst, src1, src2); break;
4775     default:
4776       fatal("Unsupported masked operation"); break;
4777   }
4778 }
4779 
/*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
 * the result is equal to the value of Integer.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
 * the result is equal to the value of Integer.MAX_VALUE.
 */
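// For example: NaN -> 0, -Infinity -> Integer.MIN_VALUE (0x80000000),
// +Infinity -> Integer.MAX_VALUE (0x7FFFFFFF), matching Java's float-to-int cast semantics.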
4788 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4789                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4790                                                                    Register rscratch, AddressLiteral float_sign_flip,
4791                                                                    int vec_enc) {
4792   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4793   Label done;
4794   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4795   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4796   vptest(xtmp2, xtmp2, vec_enc);
4797   jccb(Assembler::equal, done);
4798 
4799   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4800   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4801 
4802   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4803   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4804   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4805 
  // Recompute the mask for the remaining special values.
4807   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4808   // Extract SRC values corresponding to TRUE mask lanes.
4809   vpand(xtmp4, xtmp2, src, vec_enc);
4810   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4811   // values are set.
4812   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4813 
4814   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4815   bind(done);
4816 }
4817 
4818 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4819                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4820                                                                     Register rscratch, AddressLiteral float_sign_flip,
4821                                                                     int vec_enc) {
4822   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4823   Label done;
4824   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4825   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4826   kortestwl(ktmp1, ktmp1);
4827   jccb(Assembler::equal, done);
4828 
4829   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4830   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4831   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4832 
4833   kxorwl(ktmp1, ktmp1, ktmp2);
4834   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4835   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4836   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4837   bind(done);
4838 }
4839 
4840 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4841                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4842                                                                      Register rscratch, AddressLiteral double_sign_flip,
4843                                                                      int vec_enc) {
4844   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4845 
4846   Label done;
4847   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4848   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4849   kortestwl(ktmp1, ktmp1);
4850   jccb(Assembler::equal, done);
4851 
4852   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4853   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4854   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4855 
4856   kxorwl(ktmp1, ktmp1, ktmp2);
4857   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4858   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4859   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4860   bind(done);
4861 }
4862 
4863 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4864                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4865                                                                      Register rscratch, AddressLiteral float_sign_flip,
4866                                                                      int vec_enc) {
4867   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4868   Label done;
4869   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4870   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4871   kortestwl(ktmp1, ktmp1);
4872   jccb(Assembler::equal, done);
4873 
4874   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4875   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4876   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4877 
4878   kxorwl(ktmp1, ktmp1, ktmp2);
4879   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4880   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4881   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4882   bind(done);
4883 }
4884 
/*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
 * the result is equal to the value of Long.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
 * the result is equal to the value of Long.MAX_VALUE.
 */
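// For example: NaN -> 0L, -Infinity -> Long.MIN_VALUE, +Infinity -> Long.MAX_VALUE,
// matching Java's double-to-long cast semantics.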
4893 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4894                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4895                                                                       Register rscratch, AddressLiteral double_sign_flip,
4896                                                                       int vec_enc) {
4897   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4898 
4899   Label done;
4900   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4901   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4902   kortestwl(ktmp1, ktmp1);
4903   jccb(Assembler::equal, done);
4904 
4905   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4906   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4907   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4908 
4909   kxorwl(ktmp1, ktmp1, ktmp2);
4910   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4911   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4912   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4913   bind(done);
4914 }
4915 
4916 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4917                                                              XMMRegister xtmp, int index, int vec_enc) {
4918    assert(vec_enc < Assembler::AVX_512bit, "");
4919    if (vec_enc == Assembler::AVX_256bit) {
4920      vextractf128_high(xtmp, src);
4921      vshufps(dst, src, xtmp, index, vec_enc);
4922    } else {
4923      vshufps(dst, src, zero, index, vec_enc);
4924    }
4925 }
4926 
4927 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4928                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4929                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4930   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4931 
4932   Label done;
4933   // Compare the destination lanes with float_sign_flip
4934   // value to get mask for all special values.
4935   movdqu(xtmp1, float_sign_flip, rscratch);
4936   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4937   ptest(xtmp2, xtmp2);
4938   jccb(Assembler::equal, done);
4939 
4940   // Flip float_sign_flip to get max integer value.
4941   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4942   pxor(xtmp1, xtmp4);
4943 
  // Set destination lanes corresponding to unordered source lanes to zero.
4945   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4946   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4947 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4949   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4950   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4951 
  // Recompute the mask for the remaining special values.
4953   pxor(xtmp2, xtmp3);
4954   // Extract mask corresponding to non-negative source lanes.
4955   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4956 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4958   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4959   pand(xtmp3, xtmp2);
4960 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
4963   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4964   bind(done);
4965 }
4966 
4967 
4968 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4969                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4970   switch(to_elem_bt) {
4971     case T_SHORT:
4972       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4973       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4974       vpackusdw(dst, dst, zero, vec_enc);
4975       if (vec_enc == Assembler::AVX_256bit) {
4976         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4977       }
4978       break;
4979     case  T_BYTE:
4980       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4981       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4982       vpackusdw(dst, dst, zero, vec_enc);
4983       if (vec_enc == Assembler::AVX_256bit) {
4984         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4985       }
4986       vpackuswb(dst, dst, zero, vec_enc);
4987       break;
4988     default: assert(false, "%s", type2name(to_elem_bt));
4989   }
4990 }
4991 
/*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
 *    That value signifies that the source value could be any of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
 */
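// Note: vcvttps2dq/vcvttpd2dq write the integer-indefinite value 0x80000000 to NaN and
// out-of-range lanes, so the absence of that value is what selects the fast path above.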
5001 
5002 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5003                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5004                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5005   int to_elem_sz = type2aelembytes(to_elem_bt);
5006   assert(to_elem_sz <= 4, "");
5007   vcvttps2dq(dst, src, vec_enc);
5008   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5009   if (to_elem_sz < 4) {
5010     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5011     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5012   }
5013 }
5014 
5015 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5016                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5017                                             Register rscratch, int vec_enc) {
5018   int to_elem_sz = type2aelembytes(to_elem_bt);
5019   assert(to_elem_sz <= 4, "");
5020   vcvttps2dq(dst, src, vec_enc);
5021   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5022   switch(to_elem_bt) {
5023     case T_INT:
5024       break;
5025     case T_SHORT:
5026       evpmovdw(dst, dst, vec_enc);
5027       break;
5028     case T_BYTE:
5029       evpmovdb(dst, dst, vec_enc);
5030       break;
5031     default: assert(false, "%s", type2name(to_elem_bt));
5032   }
5033 }
5034 
5035 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5036                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5037                                             Register rscratch, int vec_enc) {
5038   evcvttps2qq(dst, src, vec_enc);
5039   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5040 }
5041 
5042 // Handling for downcasting from double to integer or sub-word types on AVX2.
5043 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5044                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5045                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5046   int to_elem_sz = type2aelembytes(to_elem_bt);
5047   assert(to_elem_sz < 8, "");
5048   vcvttpd2dq(dst, src, vec_enc);
5049   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5050                                               float_sign_flip, vec_enc);
5051   if (to_elem_sz < 4) {
5052     // xtmp4 holds all zero lanes.
5053     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5054   }
5055 }
5056 
5057 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5058                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5059                                             KRegister ktmp2, AddressLiteral sign_flip,
5060                                             Register rscratch, int vec_enc) {
5061   if (VM_Version::supports_avx512dq()) {
5062     evcvttpd2qq(dst, src, vec_enc);
5063     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5064     switch(to_elem_bt) {
5065       case T_LONG:
5066         break;
5067       case T_INT:
5068         evpmovsqd(dst, dst, vec_enc);
5069         break;
5070       case T_SHORT:
5071         evpmovsqd(dst, dst, vec_enc);
5072         evpmovdw(dst, dst, vec_enc);
5073         break;
5074       case T_BYTE:
5075         evpmovsqd(dst, dst, vec_enc);
5076         evpmovdb(dst, dst, vec_enc);
5077         break;
5078       default: assert(false, "%s", type2name(to_elem_bt));
5079     }
5080   } else {
5081     assert(type2aelembytes(to_elem_bt) <= 4, "");
5082     vcvttpd2dq(dst, src, vec_enc);
5083     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5084     switch(to_elem_bt) {
5085       case T_INT:
5086         break;
5087       case T_SHORT:
5088         evpmovdw(dst, dst, vec_enc);
5089         break;
5090       case T_BYTE:
5091         evpmovdb(dst, dst, vec_enc);
5092         break;
5093       default: assert(false, "%s", type2name(to_elem_bt));
5094     }
5095   }
5096 }
5097 
5098 #ifdef _LP64
5099 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5100                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5101                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round toward -inf,
  // then restore the original MXCSR.RC mode afterwards.
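  // For example, rounding -2.5: (-2.5 + 0.5) == -2.0 and floor(-2.0) == -2,
  // which matches Java's Math.round(-2.5) == -2.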
5104   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5105 
5106   mov64(tmp, julong_cast(0.5L));
5107   evpbroadcastq(xtmp1, tmp, vec_enc);
  vaddpd(xtmp1, src, xtmp1, vec_enc);
  evcvtpd2qq(dst, xtmp1, vec_enc);
  vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5112 
5113   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5114 }
5115 
5116 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5117                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5118                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round toward -inf,
  // then restore the original MXCSR.RC mode afterwards.
5121   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5122 
5123   movl(tmp, jint_cast(0.5));
5124   movq(xtmp1, tmp);
5125   vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src, xtmp1, vec_enc);
5127   vcvtps2dq(dst, xtmp1, vec_enc);
5128   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5129                                               float_sign_flip, vec_enc);
5130 
5131   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5132 }
5133 
5134 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5135                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5136                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round toward -inf,
  // then restore the original MXCSR.RC mode afterwards.
5139   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5140 
5141   movl(tmp, jint_cast(0.5));
5142   movq(xtmp1, tmp);
5143   vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src, xtmp1, vec_enc);
5145   vcvtps2dq(dst, xtmp1, vec_enc);
5146   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5147 
5148   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5149 }
5150 #endif // _LP64
5151 
5152 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5153                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5154   switch (from_elem_bt) {
5155     case T_BYTE:
5156       switch (to_elem_bt) {
5157         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5158         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5159         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5160         default: ShouldNotReachHere();
5161       }
5162       break;
5163     case T_SHORT:
5164       switch (to_elem_bt) {
5165         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5166         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5167         default: ShouldNotReachHere();
5168       }
5169       break;
5170     case T_INT:
5171       assert(to_elem_bt == T_LONG, "");
5172       vpmovzxdq(dst, src, vlen_enc);
5173       break;
5174     default:
5175       ShouldNotReachHere();
5176   }
5177 }
5178 
5179 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5180                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5181   switch (from_elem_bt) {
5182     case T_BYTE:
5183       switch (to_elem_bt) {
5184         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5185         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5186         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5187         default: ShouldNotReachHere();
5188       }
5189       break;
5190     case T_SHORT:
5191       switch (to_elem_bt) {
5192         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5193         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5194         default: ShouldNotReachHere();
5195       }
5196       break;
5197     case T_INT:
5198       assert(to_elem_bt == T_LONG, "");
5199       vpmovsxdq(dst, src, vlen_enc);
5200       break;
5201     default:
5202       ShouldNotReachHere();
5203   }
5204 }
5205 
5206 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5207                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5208   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5209   assert(vlen_enc != AVX_512bit, "");
5210 
5211   int dst_bt_size = type2aelembytes(dst_bt);
5212   int src_bt_size = type2aelembytes(src_bt);
5213   if (dst_bt_size > src_bt_size) {
5214     switch (dst_bt_size / src_bt_size) {
5215       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5216       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5217       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5218       default: ShouldNotReachHere();
5219     }
5220   } else {
5221     assert(dst_bt_size < src_bt_size, "");
5222     switch (src_bt_size / dst_bt_size) {
5223       case 2: {
5224         if (vlen_enc == AVX_128bit) {
5225           vpacksswb(dst, src, src, vlen_enc);
5226         } else {
5227           vpacksswb(dst, src, src, vlen_enc);
5228           vpermq(dst, dst, 0x08, vlen_enc);
5229         }
5230         break;
5231       }
5232       case 4: {
5233         if (vlen_enc == AVX_128bit) {
5234           vpackssdw(dst, src, src, vlen_enc);
5235           vpacksswb(dst, dst, dst, vlen_enc);
5236         } else {
5237           vpackssdw(dst, src, src, vlen_enc);
5238           vpermq(dst, dst, 0x08, vlen_enc);
5239           vpacksswb(dst, dst, dst, AVX_128bit);
5240         }
5241         break;
5242       }
5243       case 8: {
5244         if (vlen_enc == AVX_128bit) {
5245           vpshufd(dst, src, 0x08, vlen_enc);
5246           vpackssdw(dst, dst, dst, vlen_enc);
5247           vpacksswb(dst, dst, dst, vlen_enc);
5248         } else {
5249           vpshufd(dst, src, 0x08, vlen_enc);
5250           vpermq(dst, dst, 0x08, vlen_enc);
5251           vpackssdw(dst, dst, dst, AVX_128bit);
5252           vpacksswb(dst, dst, dst, AVX_128bit);
5253         }
5254         break;
5255       }
5256       default: ShouldNotReachHere();
5257     }
5258   }
5259 }
5260 
5261 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5262                                    bool merge, BasicType bt, int vlen_enc) {
5263   if (bt == T_INT) {
5264     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5265   } else {
5266     assert(bt == T_LONG, "");
5267     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5268   }
5269 }
5270 
5271 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5272                                    bool merge, BasicType bt, int vlen_enc) {
5273   if (bt == T_INT) {
5274     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5275   } else {
5276     assert(bt == T_LONG, "");
5277     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5278   }
5279 }
5280 
5281 #ifdef _LP64
5282 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5283                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5284                                                int vec_enc) {
5285   int index = 0;
5286   int vindex = 0;
5287   mov64(rtmp1, 0x0101010101010101L);
5288   pdepq(rtmp1, src, rtmp1);
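  // pdepq deposits the low 8 bits of src into the byte lanes selected by the
  // 0x0101010101010101 mask, turning each mask bit into a 0x00/0x01 byte.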
5289   if (mask_len > 8) {
5290     movq(rtmp2, src);
5291     vpxor(xtmp, xtmp, xtmp, vec_enc);
5292     movq(xtmp, rtmp1);
5293   }
5294   movq(dst, rtmp1);
5295 
5296   mask_len -= 8;
5297   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5299     index++;
5300     if ((index % 2) == 0) {
5301       pxor(xtmp, xtmp);
5302     }
5303     mov64(rtmp1, 0x0101010101010101L);
5304     shrq(rtmp2, 8);
5305     pdepq(rtmp1, rtmp2, rtmp1);
5306     pinsrq(xtmp, rtmp1, index % 2);
5307     vindex = index / 2;
5308     if (vindex) {
      // Write the entire 16-byte vector when both 64-bit
      // lanes are updated, to save redundant instructions.
5311       if (index % 2) {
5312         vinsertf128(dst, dst, xtmp, vindex);
5313       }
5314     } else {
5315       vmovdqu(dst, xtmp);
5316     }
5317     mask_len -= 8;
5318   }
5319 }
5320 
5321 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5322   switch(opc) {
5323     case Op_VectorMaskTrueCount:
5324       popcntq(dst, tmp);
5325       break;
5326     case Op_VectorMaskLastTrue:
5327       if (VM_Version::supports_lzcnt()) {
5328         lzcntq(tmp, tmp);
5329         movl(dst, 63);
5330         subl(dst, tmp);
5331       } else {
5332         movl(dst, -1);
5333         bsrq(tmp, tmp);
5334         cmov32(Assembler::notZero, dst, tmp);
5335       }
5336       break;
5337     case Op_VectorMaskFirstTrue:
5338       if (VM_Version::supports_bmi1()) {
5339         if (masklen < 32) {
5340           orl(tmp, 1 << masklen);
5341           tzcntl(dst, tmp);
5342         } else if (masklen == 32) {
5343           tzcntl(dst, tmp);
5344         } else {
5345           assert(masklen == 64, "");
5346           tzcntq(dst, tmp);
5347         }
5348       } else {
5349         if (masklen < 32) {
5350           orl(tmp, 1 << masklen);
5351           bsfl(dst, tmp);
5352         } else {
5353           assert(masklen == 32 || masklen == 64, "");
5354           movl(dst, masklen);
5355           if (masklen == 32)  {
5356             bsfl(tmp, tmp);
5357           } else {
5358             bsfq(tmp, tmp);
5359           }
5360           cmov32(Assembler::notZero, dst, tmp);
5361         }
5362       }
5363       break;
5364     case Op_VectorMaskToLong:
5365       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5366       break;
5367     default: assert(false, "Unhandled mask operation");
5368   }
5369 }
5370 
5371 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5372                                               int masklen, int masksize, int vec_enc) {
5373   assert(VM_Version::supports_popcnt(), "");
5374 
  if (VM_Version::supports_avx512bw()) {
5376     kmovql(tmp, mask);
5377   } else {
5378     assert(masklen <= 16, "");
5379     kmovwl(tmp, mask);
5380   }
5381 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
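  // For example, with masklen == 4 only the low four bits of tmp are meaningful,
  // so the mask below reduces to 0xF.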
5384   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5385     andq(tmp, (1 << masklen) - 1);
5386   }
5387 
5388   vector_mask_operation_helper(opc, dst, tmp, masklen);
5389 }
5390 
5391 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5392                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5393   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5394          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5395   assert(VM_Version::supports_popcnt(), "");
5396 
5397   bool need_clip = false;
5398   switch(bt) {
5399     case T_BOOLEAN:
      // While masks of other types contain lane values 0 and -1,
      // boolean masks contain lane values 0 and 1.
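      // e.g. a boolean lane value of 1 becomes -1 (0xFF) after the subtraction below,
      // so vpmovmskb can pick up its sign bit.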
5401       vpxor(xtmp, xtmp, xtmp, vec_enc);
5402       vpsubb(xtmp, xtmp, mask, vec_enc);
5403       vpmovmskb(tmp, xtmp, vec_enc);
5404       need_clip = masklen < 16;
5405       break;
5406     case T_BYTE:
5407       vpmovmskb(tmp, mask, vec_enc);
5408       need_clip = masklen < 16;
5409       break;
5410     case T_SHORT:
5411       vpacksswb(xtmp, mask, mask, vec_enc);
5412       if (masklen >= 16) {
5413         vpermpd(xtmp, xtmp, 8, vec_enc);
5414       }
5415       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5416       need_clip = masklen < 16;
5417       break;
5418     case T_INT:
5419     case T_FLOAT:
5420       vmovmskps(tmp, mask, vec_enc);
5421       need_clip = masklen < 4;
5422       break;
5423     case T_LONG:
5424     case T_DOUBLE:
5425       vmovmskpd(tmp, mask, vec_enc);
5426       need_clip = masklen < 2;
5427       break;
5428     default: assert(false, "Unhandled type, %s", type2name(bt));
5429   }
5430 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5433   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5434     // need_clip implies masklen < 32
5435     andq(tmp, (1 << masklen) - 1);
5436   }
5437 
5438   vector_mask_operation_helper(opc, dst, tmp, masklen);
5439 }
5440 
5441 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5442                                              Register rtmp2, int mask_len) {
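  // Produces a mask with popcount(src & ((1 << mask_len) - 1)) low bits set:
  // pextq packs the bits of -1 selected by the clipped source mask into the low positions.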
5443   kmov(rtmp1, src);
5444   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5445   mov64(rtmp2, -1L);
5446   pextq(rtmp2, rtmp2, rtmp1);
5447   kmov(dst, rtmp2);
5448 }
5449 
5450 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5451                                                bool merge, BasicType bt, int vec_enc) {
5452   if (opcode == Op_CompressV) {
5453     switch(bt) {
5454     case T_BYTE:
5455       evpcompressb(dst, mask, src, merge, vec_enc);
5456       break;
5457     case T_CHAR:
5458     case T_SHORT:
5459       evpcompressw(dst, mask, src, merge, vec_enc);
5460       break;
5461     case T_INT:
5462       evpcompressd(dst, mask, src, merge, vec_enc);
5463       break;
5464     case T_FLOAT:
5465       evcompressps(dst, mask, src, merge, vec_enc);
5466       break;
5467     case T_LONG:
5468       evpcompressq(dst, mask, src, merge, vec_enc);
5469       break;
5470     case T_DOUBLE:
5471       evcompresspd(dst, mask, src, merge, vec_enc);
5472       break;
5473     default:
5474       fatal("Unsupported type %s", type2name(bt));
5475       break;
5476     }
5477   } else {
5478     assert(opcode == Op_ExpandV, "");
5479     switch(bt) {
5480     case T_BYTE:
5481       evpexpandb(dst, mask, src, merge, vec_enc);
5482       break;
5483     case T_CHAR:
5484     case T_SHORT:
5485       evpexpandw(dst, mask, src, merge, vec_enc);
5486       break;
5487     case T_INT:
5488       evpexpandd(dst, mask, src, merge, vec_enc);
5489       break;
5490     case T_FLOAT:
5491       evexpandps(dst, mask, src, merge, vec_enc);
5492       break;
5493     case T_LONG:
5494       evpexpandq(dst, mask, src, merge, vec_enc);
5495       break;
5496     case T_DOUBLE:
5497       evexpandpd(dst, mask, src, merge, vec_enc);
5498       break;
5499     default:
5500       fatal("Unsupported type %s", type2name(bt));
5501       break;
5502     }
5503   }
5504 }
5505 #endif
5506 
5507 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5508                                            KRegister ktmp1, int vec_enc) {
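       // Per-lane semantics (a summary of the blends below):
       //   dst = (src < 0) ? -1.0 : (src > 0) ? 1.0 : src
       // so NaN, -0.0 and 0.0 are passed through unchanged.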
5509   if (opcode == Op_SignumVD) {
5510     vsubpd(dst, zero, one, vec_enc);
5511     // dst = (src < 0) ? -1 : 1
5512     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5513     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5514     // If src is NaN, -0.0 or 0.0, return src unchanged.
5515     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5516     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5517   } else {
5518     assert(opcode == Op_SignumVF, "");
5519     vsubps(dst, zero, one, vec_enc);
5520     // dst = (src < 0) ? -1 : 1
5521     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5522     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5523     // If src is NaN, -0.0 or 0.0, return src unchanged.
5524     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5525     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5526   }
5527 }
5528 
5529 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5530                                           XMMRegister xtmp1, int vec_enc) {
5531   if (opcode == Op_SignumVD) {
5532     vsubpd(dst, zero, one, vec_enc);
5533     // dst = (src < 0) ? -1 : 1
5534     vblendvpd(dst, one, dst, src, vec_enc);
5535     // If src is NaN, -0.0 or 0.0, return src unchanged.
5536     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5537     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5538   } else {
5539     assert(opcode == Op_SignumVF, "");
5540     vsubps(dst, zero, one, vec_enc);
5541     // dst = (src < 0) ? -1 : 1
5542     vblendvps(dst, one, dst, src, vec_enc);
5543     // If src is NaN, -0.0 or 0.0, return src unchanged.
5544     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5545     vblendvps(dst, dst, src, xtmp1, vec_enc);
5546   }
5547 }
5548 
5549 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5550   if (VM_Version::supports_avx512bw()) {
5551     if (mask_len > 32) {
5552       kmovql(dst, src);
5553     } else {
5554       kmovdl(dst, src);
5555       if (mask_len != 32) {
5556         kshiftrdl(dst, dst, 32 - mask_len);
5557       }
5558     }
5559   } else {
5560     assert(mask_len <= 16, "");
5561     kmovwl(dst, src);
5562     if (mask_len != 16) {
5563       kshiftrwl(dst, dst, 16 - mask_len);
5564     }
5565   }
5566 }
5567 
5568 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5569   int lane_size = type2aelembytes(bt);
5570   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5571   if ((is_LP64 || lane_size < 8) &&
5572       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5573        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5574     movptr(rtmp, imm32);
5575     switch(lane_size) {
5576       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5577       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5578       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5579       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5580       default : fatal("Unsupported lane size %d", lane_size);
5581       break;
5582     }
5583   } else {
5584     movptr(rtmp, imm32);
5585     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5586     switch(lane_size) {
5587       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5588       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5589       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5590       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5591       default : fatal("Unsupported lane size %d", lane_size);
5592       break;
5593     }
5594   }
5595 }
5596 
5597 //
5598 // Following is a lookup table based popcount computation algorithm:
5599 //       Index   Bit set count
5600 //     [ 0000 ->   0,
5601 //       0001 ->   1,
5602 //       0010 ->   1,
5603 //       0011 ->   2,
5604 //       0100 ->   1,
5605 //       0101 ->   2,
5606 //       0110 ->   2,
5607 //       0111 ->   3,
5608 //       1000 ->   1,
5609 //       1001 ->   2,
5610 //       1010 ->   2,
5611 //       1011 ->   3,
5612 //       1100 ->   2,
5613 //       1101 ->   3,
     //       1110 ->   3,
5614 //       1111 ->   4 ]
5615 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5616 //     shuffle indices for lookup table access.
5617 //  b. Right shift each byte of vector lane by 4 positions.
5618 //  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5619 //     shuffle indices for lookup table access.
5620 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5621 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5622 //     count of all the bytes of a quadword.
5623 //  f. Perform step e. for upper 128bit vector lane.
5624 //  g. Pack the bitset count of quadwords back to double word.
5625 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
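     //
     // A minimal scalar sketch of steps a. to d. for one byte (illustrative only, assuming
     // an unsigned 8-bit input; the vector code performs this per byte lane, with vpshufb
     // acting as the table lookup):
     //
     //   static const uint8_t POPCNT_LUT[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
     //   static inline uint8_t popcount_byte(uint8_t b) {
     //     return POPCNT_LUT[b & 0x0F] + POPCNT_LUT[b >> 4];
     //   }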
5626 
5627 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5628                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5629   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5630   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5631   vpsrlw(dst, src, 4, vec_enc);
5632   vpand(dst, dst, xtmp1, vec_enc);
5633   vpand(xtmp1, src, xtmp1, vec_enc);
5634   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5635   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5636   vpshufb(dst, xtmp2, dst, vec_enc);
5637   vpaddb(dst, dst, xtmp1, vec_enc);
5638 }
5639 
5640 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5641                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5642   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5643   // Following code is as per steps e,f,g and h of above algorithm.
5644   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5645   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5646   vpsadbw(dst, dst, xtmp2, vec_enc);
5647   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5648   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5649   vpackuswb(dst, xtmp1, dst, vec_enc);
5650 }
5651 
5652 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5653                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5654   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5655   // Add the popcount of upper and lower bytes of word.
5656   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5657   vpsrlw(dst, xtmp1, 8, vec_enc);
5658   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5659   vpaddw(dst, dst, xtmp1, vec_enc);
5660 }
5661 
5662 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5663                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5664   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5665   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5666   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5667 }
5668 
5669 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5670                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5671   switch(bt) {
5672     case T_LONG:
5673       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5674       break;
5675     case T_INT:
5676       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5677       break;
5678     case T_CHAR:
5679     case T_SHORT:
5680       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5681       break;
5682     case T_BYTE:
5683     case T_BOOLEAN:
5684       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5685       break;
5686     default:
5687       fatal("Unsupported type %s", type2name(bt));
5688       break;
5689   }
5690 }
5691 
5692 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5693                                                       KRegister mask, bool merge, int vec_enc) {
5694   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5695   switch(bt) {
5696     case T_LONG:
5697       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5698       evpopcntq(dst, mask, src, merge, vec_enc);
5699       break;
5700     case T_INT:
5701       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5702       evpopcntd(dst, mask, src, merge, vec_enc);
5703       break;
5704     case T_CHAR:
5705     case T_SHORT:
5706       assert(VM_Version::supports_avx512_bitalg(), "");
5707       evpopcntw(dst, mask, src, merge, vec_enc);
5708       break;
5709     case T_BYTE:
5710     case T_BOOLEAN:
5711       assert(VM_Version::supports_avx512_bitalg(), "");
5712       evpopcntb(dst, mask, src, merge, vec_enc);
5713       break;
5714     default:
5715       fatal("Unsupported type %s", type2name(bt));
5716       break;
5717   }
5718 }
5719 
5720 #ifndef _LP64
5721 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5722   assert(VM_Version::supports_avx512bw(), "");
5723   kmovdl(tmp, src);
5724   kunpckdql(dst, tmp, tmp);
5725 }
5726 #endif
5727 
5728 // Bit reversal algorithm first reverses the bits of each byte followed by
5729 // a byte level reversal for multi-byte primitive types (short/int/long).
5730 // Algorithm performs a lookup table access to get reverse bit sequence
5731 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5732 // is obtained by swapping the reverse bit sequences of upper and lower
5733 // nibble of a byte.
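     // A scalar sketch of the byte-level step (illustrative only; the table below is the
     // 4-bit reverse table described above, not a reference to the actual stub data):
     //
     //   static const uint8_t REVBIT_LUT[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
     //                                          0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
     //   static inline uint8_t reverse_bits_in_byte(uint8_t b) {
     //     return (uint8_t)((REVBIT_LUT[b & 0x0F] << 4) | REVBIT_LUT[b >> 4]);
     //   }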
5734 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5735                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5736   if (VM_Version::supports_avx512vlbw()) {
5737 
5738     // Get the reverse bit sequence of lower nibble of each byte.
5739     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5740     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5741     evpandq(dst, xtmp2, src, vec_enc);
5742     vpshufb(dst, xtmp1, dst, vec_enc);
5743     vpsllq(dst, dst, 4, vec_enc);
5744 
5745     // Get the reverse bit sequence of upper nibble of each byte.
5746     vpandn(xtmp2, xtmp2, src, vec_enc);
5747     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5748     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5749 
5750     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5751     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5752     evporq(xtmp2, dst, xtmp2, vec_enc);
5753     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5754 
5755   } else if(vec_enc == Assembler::AVX_512bit) {
5756     // Shift based bit reversal.
5757     assert(bt == T_LONG || bt == T_INT, "");
5758 
5759     // Swap lower and upper nibble of each byte.
5760     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5761 
5762     // Swap two least and most significant bits of each nibble.
5763     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5764 
5765     // Swap adjacent pair of bits.
5766     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5767     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5768 
5769     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5770     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5771   } else {
5772     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5773     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5774 
5775     // Get the reverse bit sequence of lower nibble of each byte.
5776     vpand(dst, xtmp2, src, vec_enc);
5777     vpshufb(dst, xtmp1, dst, vec_enc);
5778     vpsllq(dst, dst, 4, vec_enc);
5779 
5780     // Get the reverse bit sequence of upper nibble of each byte.
5781     vpandn(xtmp2, xtmp2, src, vec_enc);
5782     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5783     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5784 
5785     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5786     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5787     vpor(xtmp2, dst, xtmp2, vec_enc);
5788     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5789   }
5790 }
5791 
5792 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5793                                                 XMMRegister xtmp, Register rscratch) {
5794   assert(VM_Version::supports_gfni(), "");
5795   assert(rscratch != noreg || always_reachable(mask), "missing");
5796 
5797   // Galois field instruction based bit reversal based on following algorithm.
5798   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5799   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5800   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5801   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5802 }
5803 
5804 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5805                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
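       // Swap adjacent nbits-wide bit groups (selected by bitmask) within each quadword:
       //   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)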
5806   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5807   evpandq(dst, xtmp1, src, vec_enc);
5808   vpsllq(dst, dst, nbits, vec_enc);
5809   vpandn(xtmp1, xtmp1, src, vec_enc);
5810   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5811   evporq(dst, dst, xtmp1, vec_enc);
5812 }
5813 
5814 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5815                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5816   // Shift based bit reversal.
5817   assert(VM_Version::supports_evex(), "");
5818   switch(bt) {
5819     case T_LONG:
5820       // Swap upper and lower double word of each quad word.
5821       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5822       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5823       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5824       break;
5825     case T_INT:
5826       // Swap upper and lower word of each double word.
5827       evprord(xtmp1, k0, src, 16, true, vec_enc);
5828       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5829       break;
5830     case T_CHAR:
5831     case T_SHORT:
5832       // Swap upper and lower byte of each word.
5833       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5834       break;
5835     case T_BYTE:
5836       evmovdquq(dst, k0, src, true, vec_enc);
5837       break;
5838     default:
5839       fatal("Unsupported type %s", type2name(bt));
5840       break;
5841   }
5842 }
5843 
5844 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5845   if (bt == T_BYTE) {
5846     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5847       evmovdquq(dst, k0, src, true, vec_enc);
5848     } else {
5849       vmovdqu(dst, src);
5850     }
5851     return;
5852   }
5853   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5854   // pre-computed shuffle indices.
5855   switch(bt) {
5856     case T_LONG:
5857       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5858       break;
5859     case T_INT:
5860       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5861       break;
5862     case T_CHAR:
5863     case T_SHORT:
5864       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5865       break;
5866     default:
5867       fatal("Unsupported type %s", type2name(bt));
5868       break;
5869   }
5870   vpshufb(dst, src, dst, vec_enc);
5871 }
5872 
5873 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5874                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5875                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5876   assert(is_integral_type(bt), "");
5877   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5878   assert(VM_Version::supports_avx512cd(), "");
5879   switch(bt) {
5880     case T_LONG:
5881       evplzcntq(dst, ktmp, src, merge, vec_enc);
5882       break;
5883     case T_INT:
5884       evplzcntd(dst, ktmp, src, merge, vec_enc);
5885       break;
5886     case T_SHORT:
5887       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5888       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5889       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5890       vpunpckhwd(dst, xtmp1, src, vec_enc);
5891       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5892       vpackusdw(dst, xtmp2, dst, vec_enc);
5893       break;
5894     case T_BYTE:
5895       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5896       // accessing the lookup table.
5897       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5898       // accessing the lookup table.
5899       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5900       assert(VM_Version::supports_avx512bw(), "");
5901       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5902       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5903       vpand(xtmp2, dst, src, vec_enc);
5904       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5905       vpsrlw(xtmp3, src, 4, vec_enc);
5906       vpand(xtmp3, dst, xtmp3, vec_enc);
5907       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5908       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5909       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5910       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5911       break;
5912     default:
5913       fatal("Unsupported type %s", type2name(bt));
5914       break;
5915   }
5916 }
5917 
5918 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5919                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5920   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5921   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5922   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5923   // accessing the lookup table.
5924   vpand(dst, xtmp2, src, vec_enc);
5925   vpshufb(dst, xtmp1, dst, vec_enc);
5926   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5927   // accessing the lookup table.
5928   vpsrlw(xtmp3, src, 4, vec_enc);
5929   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5930   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5931   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5932   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5933   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5934   vpaddb(dst, dst, xtmp2, vec_enc);
5935   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5936 }
5937 
5938 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5939                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5940   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5941   // Add zero counts of lower byte and upper byte of a word if
5942   // upper byte holds a zero value.
5943   vpsrlw(xtmp3, src, 8, vec_enc);
5944   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5945   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5946   vpsllw(xtmp2, dst, 8, vec_enc);
5947   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5948   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5949   vpsrlw(dst, dst, 8, vec_enc);
5950 }
5951 
5952 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5953                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5954   // Since the IEEE 754 floating point format represents the mantissa in normalized
5955   // 1.x form, the biased exponent can be used to compute the leading zero count as
5956   // per the following formula (it matches the step-by-step comments below):
5957   // LZCNT = 32 - (biased_exp - 127 + 1)
5958   // Special handling is needed for zero, max_int and negative source values.
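       //
       // A scalar sketch of the idea for one int lane (illustrative only; it ignores
       // rounding in the int->float conversion as well as the zero/max_int/negative
       // special cases that the code below handles explicitly):
       //
       //   float f = (float) x;                    // x assumed > 0
       //   uint32_t bits;
       //   memcpy(&bits, &f, sizeof(bits));
       //   uint32_t biased_exp = (bits >> 23) & 0xFF;
       //   uint32_t lzcnt = 32 - (biased_exp - 127 + 1);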
5959 
5960   // Broadcast 0xFF
5961   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5962   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5963 
5964   // Extract biased exponent.
5965   vcvtdq2ps(dst, src, vec_enc);
5966   vpsrld(dst, dst, 23, vec_enc);
5967   vpand(dst, dst, xtmp1, vec_enc);
5968 
5969   // Broadcast 127.
5970   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5971   // Exponent = biased_exp - 127
5972   vpsubd(dst, dst, xtmp1, vec_enc);
5973 
5974   // Exponent = Exponent  + 1
5975   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5976   vpaddd(dst, dst, xtmp3, vec_enc);
5977 
5978   // Replace -ve exponent with zero, exponent is -ve when src
5979   // lane contains a zero value.
5980   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5981   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5982 
5983   // Rematerialize broadcast 32.
5984   vpslld(xtmp1, xtmp3, 5, vec_enc);
5985   // Exponent is 32 if corresponding source lane contains max_int value.
5986   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5987   // LZCNT = 32 - exponent
5988   vpsubd(dst, xtmp1, dst, vec_enc);
5989 
5990   // Replace LZCNT with a value 1 if corresponding source lane
5991   // contains max_int value.
5992   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5993 
5994   // Replace the computed LZCNT with 0 if the source lane value is negative.
5995   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5996   vblendvps(dst, dst, xtmp2, src, vec_enc);
5997 }
5998 
5999 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6000                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6001   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6002   // Add zero counts of lower word and upper word of a double word if
6003   // upper word holds a zero value.
6004   vpsrld(xtmp3, src, 16, vec_enc);
6005   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6006   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6007   vpslld(xtmp2, dst, 16, vec_enc);
6008   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6009   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6010   vpsrld(dst, dst, 16, vec_enc);
6011   // Add zero counts of lower doubleword and upper doubleword of a
6012   // quadword if upper doubleword holds a zero value.
6013   vpsrlq(xtmp3, src, 32, vec_enc);
6014   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6015   vpsllq(xtmp2, dst, 32, vec_enc);
6016   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6017   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6018   vpsrlq(dst, dst, 32, vec_enc);
6019 }
6020 
6021 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6022                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6023                                                        Register rtmp, int vec_enc) {
6024   assert(is_integral_type(bt), "unexpected type");
6025   assert(vec_enc < Assembler::AVX_512bit, "");
6026   switch(bt) {
6027     case T_LONG:
6028       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6029       break;
6030     case T_INT:
6031       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6032       break;
6033     case T_SHORT:
6034       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6035       break;
6036     case T_BYTE:
6037       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6038       break;
6039     default:
6040       fatal("Unsupported type %s", type2name(bt));
6041       break;
6042   }
6043 }
6044 
6045 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6046   switch(bt) {
6047     case T_BYTE:
6048       vpsubb(dst, src1, src2, vec_enc);
6049       break;
6050     case T_SHORT:
6051       vpsubw(dst, src1, src2, vec_enc);
6052       break;
6053     case T_INT:
6054       vpsubd(dst, src1, src2, vec_enc);
6055       break;
6056     case T_LONG:
6057       vpsubq(dst, src1, src2, vec_enc);
6058       break;
6059     default:
6060       fatal("Unsupported type %s", type2name(bt));
6061       break;
6062   }
6063 }
6064 
6065 // Trailing zero count computation is based on the leading zero count operation as
6066 // per the following equation. All AVX3 targets support the AVX512CD feature, which
6067 // offers a direct vector instruction to compute the leading zero count.
6068 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
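     //
     // For a 32-bit lane this amounts to the scalar computation below (an illustrative
     // sketch; clz32() stands for the vector leading zero count computed further down):
     //
     //   static inline uint32_t ctz32(uint32_t x) {
     //     return 32 - clz32((x - 1) & ~x);   // x == 0 yields 32 - clz32(~0u) = 32
     //   }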
6069 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6070                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6071                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6072   assert(is_integral_type(bt), "");
6073   // xtmp = -1
6074   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6075   // xtmp = xtmp + src
6076   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6077   // xtmp = xtmp & ~src
6078   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6079   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6080   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6081   vpsub(bt, dst, xtmp4, dst, vec_enc);
6082 }
6083 
6084 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation.
6085 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
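     //
     // For a 32-bit lane this amounts to the scalar computation below (an illustrative
     // sketch; popcount32() stands for the vector popcount computed further down):
     //
     //   static inline uint32_t ctz32_via_popcount(uint32_t x) {
     //     return 32 - popcount32(x | (0u - x));   // x == 0 yields 32 - 0 = 32
     //   }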
6086 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6087                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6088   assert(is_integral_type(bt), "");
6089   // xtmp = 0
6090   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6091   // xtmp = 0 - src
6092   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6093   // xtmp = xtmp | src
6094   vpor(xtmp3, xtmp3, src, vec_enc);
6095   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6096   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6097   vpsub(bt, dst, xtmp1, dst, vec_enc);
6098 }
6099 
6100 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6101   Label done;
6102   Label neg_divisor_fastpath;
6103   cmpl(divisor, 0);
6104   jccb(Assembler::less, neg_divisor_fastpath);
6105   xorl(rdx, rdx);
6106   divl(divisor);
6107   jmpb(done);
6108   bind(neg_divisor_fastpath);
6109   // Fastpath for divisor < 0:
6110   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6111   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
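       // A divisor with its sign bit set is >= 2^31 as an unsigned value, so the unsigned
       // quotient can only be 0 or 1; it is 1 exactly when dividend >= divisor (unsigned),
       // which the expression above computes branch-free from the sign bit of
       // (dividend & ~(dividend - divisor)).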
6112   movl(rdx, rax);
6113   subl(rdx, divisor);
6114   if (VM_Version::supports_bmi1()) {
6115     andnl(rax, rdx, rax);
6116   } else {
6117     notl(rdx);
6118     andl(rax, rdx);
6119   }
6120   shrl(rax, 31);
6121   bind(done);
6122 }
6123 
6124 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6125   Label done;
6126   Label neg_divisor_fastpath;
6127   cmpl(divisor, 0);
6128   jccb(Assembler::less, neg_divisor_fastpath);
6129   xorl(rdx, rdx);
6130   divl(divisor);
6131   jmpb(done);
6132   bind(neg_divisor_fastpath);
6133   // Fastpath when divisor < 0:
6134   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6135   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
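       // The arithmetic shift turns the 0/1 quotient bit into a 0/-1 mask, so the masked
       // subtraction below computes remainder = dividend - quotient * divisor.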
6136   movl(rdx, rax);
6137   subl(rax, divisor);
6138   if (VM_Version::supports_bmi1()) {
6139     andnl(rax, rax, rdx);
6140   } else {
6141     notl(rax);
6142     andl(rax, rdx);
6143   }
6144   sarl(rax, 31);
6145   andl(rax, divisor);
6146   subl(rdx, rax);
6147   bind(done);
6148 }
6149 
6150 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6151   Label done;
6152   Label neg_divisor_fastpath;
6153 
6154   cmpl(divisor, 0);
6155   jccb(Assembler::less, neg_divisor_fastpath);
6156   xorl(rdx, rdx);
6157   divl(divisor);
6158   jmpb(done);
6159   bind(neg_divisor_fastpath);
6160   // Fastpath for divisor < 0:
6161   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6162   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6163   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6164   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6165   movl(rdx, rax);
6166   subl(rax, divisor);
6167   if (VM_Version::supports_bmi1()) {
6168     andnl(rax, rax, rdx);
6169   } else {
6170     notl(rax);
6171     andl(rax, rdx);
6172   }
6173   movl(tmp, rax);
6174   shrl(rax, 31); // quotient
6175   sarl(tmp, 31);
6176   andl(tmp, divisor);
6177   subl(rdx, tmp); // remainder
6178   bind(done);
6179 }
6180 
6181 #ifdef _LP64
6182 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6183                                  XMMRegister xtmp2, Register rtmp) {
6184   if(VM_Version::supports_gfni()) {
6185     // Galois field instruction based bit reversal based on following algorithm.
6186     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
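         // The 64-bit constant below encodes the 8x8 bit matrix which, applied per byte by
         // gf2p8affineqb, reverses the bit order within each byte; the final bswapl then
         // reverses the byte order, completing the 32-bit bit reversal.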
6187     mov64(rtmp, 0x8040201008040201L);
6188     movq(xtmp1, src);
6189     movq(xtmp2, rtmp);
6190     gf2p8affineqb(xtmp1, xtmp2, 0);
6191     movq(dst, xtmp1);
6192   } else {
6193     // Swap even and odd numbered bits.
6194     movl(rtmp, src);
6195     andl(rtmp, 0x55555555);
6196     shll(rtmp, 1);
6197     movl(dst, src);
6198     andl(dst, 0xAAAAAAAA);
6199     shrl(dst, 1);
6200     orl(dst, rtmp);
6201 
6202     // Swap LSB and MSB 2 bits of each nibble.
6203     movl(rtmp, dst);
6204     andl(rtmp, 0x33333333);
6205     shll(rtmp, 2);
6206     andl(dst, 0xCCCCCCCC);
6207     shrl(dst, 2);
6208     orl(dst, rtmp);
6209 
6210     // Swap LSB and MSB 4 bits of each byte.
6211     movl(rtmp, dst);
6212     andl(rtmp, 0x0F0F0F0F);
6213     shll(rtmp, 4);
6214     andl(dst, 0xF0F0F0F0);
6215     shrl(dst, 4);
6216     orl(dst, rtmp);
6217   }
6218   bswapl(dst);
6219 }
6220 
6221 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6222                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6223   if(VM_Version::supports_gfni()) {
6224     // Galois field instruction based bit reversal based on following algorithm.
6225     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6226     mov64(rtmp1, 0x8040201008040201L);
6227     movq(xtmp1, src);
6228     movq(xtmp2, rtmp1);
6229     gf2p8affineqb(xtmp1, xtmp2, 0);
6230     movq(dst, xtmp1);
6231   } else {
6232     // Swap even and odd numbered bits.
6233     movq(rtmp1, src);
6234     mov64(rtmp2, 0x5555555555555555L);
6235     andq(rtmp1, rtmp2);
6236     shlq(rtmp1, 1);
6237     movq(dst, src);
6238     notq(rtmp2);
6239     andq(dst, rtmp2);
6240     shrq(dst, 1);
6241     orq(dst, rtmp1);
6242 
6243     // Swap LSB and MSB 2 bits of each nibble.
6244     movq(rtmp1, dst);
6245     mov64(rtmp2, 0x3333333333333333L);
6246     andq(rtmp1, rtmp2);
6247     shlq(rtmp1, 2);
6248     notq(rtmp2);
6249     andq(dst, rtmp2);
6250     shrq(dst, 2);
6251     orq(dst, rtmp1);
6252 
6253     // Swap LSB and MSB 4 bits of each byte.
6254     movq(rtmp1, dst);
6255     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6256     andq(rtmp1, rtmp2);
6257     shlq(rtmp1, 4);
6258     notq(rtmp2);
6259     andq(dst, rtmp2);
6260     shrq(dst, 4);
6261     orq(dst, rtmp1);
6262   }
6263   bswapq(dst);
6264 }
6265 
6266 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6267   Label done;
6268   Label neg_divisor_fastpath;
6269   cmpq(divisor, 0);
6270   jccb(Assembler::less, neg_divisor_fastpath);
6271   xorl(rdx, rdx);
6272   divq(divisor);
6273   jmpb(done);
6274   bind(neg_divisor_fastpath);
6275   // Fastpath for divisor < 0:
6276   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6277   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6278   movq(rdx, rax);
6279   subq(rdx, divisor);
6280   if (VM_Version::supports_bmi1()) {
6281     andnq(rax, rdx, rax);
6282   } else {
6283     notq(rdx);
6284     andq(rax, rdx);
6285   }
6286   shrq(rax, 63);
6287   bind(done);
6288 }
6289 
6290 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6291   Label done;
6292   Label neg_divisor_fastpath;
6293   cmpq(divisor, 0);
6294   jccb(Assembler::less, neg_divisor_fastpath);
6295   xorq(rdx, rdx);
6296   divq(divisor);
6297   jmp(done);
6298   bind(neg_divisor_fastpath);
6299   // Fastpath when divisor < 0:
6300   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6301   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6302   movq(rdx, rax);
6303   subq(rax, divisor);
6304   if (VM_Version::supports_bmi1()) {
6305     andnq(rax, rax, rdx);
6306   } else {
6307     notq(rax);
6308     andq(rax, rdx);
6309   }
6310   sarq(rax, 63);
6311   andq(rax, divisor);
6312   subq(rdx, rax);
6313   bind(done);
6314 }
6315 
6316 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6317   Label done;
6318   Label neg_divisor_fastpath;
6319   cmpq(divisor, 0);
6320   jccb(Assembler::less, neg_divisor_fastpath);
6321   xorq(rdx, rdx);
6322   divq(divisor);
6323   jmp(done);
6324   bind(neg_divisor_fastpath);
6325   // Fastpath for divisor < 0:
6326   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6327   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6328   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6329   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6330   movq(rdx, rax);
6331   subq(rax, divisor);
6332   if (VM_Version::supports_bmi1()) {
6333     andnq(rax, rax, rdx);
6334   } else {
6335     notq(rax);
6336     andq(rax, rdx);
6337   }
6338   movq(tmp, rax);
6339   shrq(rax, 63); // quotient
6340   sarq(tmp, 63);
6341   andq(tmp, divisor);
6342   subq(rdx, tmp); // remainder
6343   bind(done);
6344 }
6345 #endif
6346 
6347 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6348                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6349                                         int vlen_enc) {
6350   assert(VM_Version::supports_avx512bw(), "");
6351   // Byte shuffles are inlane operations and indices are determined using
6352   // the lower 4 bits of each shuffle lane, thus all shuffle indices are
6353   // normalized to the index range 0-15. This makes sure that all the multiples
6354   // of an index value are placed at the same relative position in a 128 bit
6355   // lane, i.e. elements corresponding to shuffle indices 16, 32 and 64 all
6356   // map to the same byte position within their respective 128 bit lanes.
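       // Net effect, for in-range indices: dst[i] = src[shuffle[i]] for every byte lane i.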
6357   movl(rtmp, 16);
6358   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6359 
6360   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6361   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6362   // original shuffle indices and move the shuffled lanes corresponding to true
6363   // mask to destination vector.
6364   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6365   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6366   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6367 
6368   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6369   // and broadcasting second 128 bit lane.
6370   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6371   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6372   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6373   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6374   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6375 
6376   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6377   // and broadcasting third 128 bit lane.
6378   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6379   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6380   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6381   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6382   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6383 
6384   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6385   // and broadcasting fourth 128 bit lane.
6386   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6387   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6388   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6389   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6390   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6391 }
6392 
6393 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6394                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6395   if (vlen_enc == AVX_128bit) {
6396     vpermilps(dst, src, shuffle, vlen_enc);
6397   } else if (bt == T_INT) {
6398     vpermd(dst, shuffle, src, vlen_enc);
6399   } else {
6400     assert(bt == T_FLOAT, "");
6401     vpermps(dst, shuffle, src, vlen_enc);
6402   }
6403 }
6404 
6405 #ifdef _LP64
6406 void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
6407   C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
6408   Compile::current()->output()->add_stub(stub);
6409 
6410   // Note: Don't clobber obj anywhere in this method!
6411 
6412   // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
6413   // obj-start, so that we can load from the object's mark-word instead. Usually the address
6414   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
6415   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
6416   // then passes that register as obj and 0 in disp. The following code extracts the base
6417   // and offset to load the mark-word.
6418   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
6419   movq(dst, Address(obj, index, scale, offset));
6420   testb(dst, markWord::monitor_value);
6421   jcc(Assembler::notZero, stub->entry());
6422   bind(stub->continuation());
6423   shrq(dst, markWord::klass_shift);
6424 }
6425 #endif