1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // while the frame allocation can be either 3 or 6 bytes. So if we
  // don't do a stack bang then we must use the 6-byte frame allocation
  // even if we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack; the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
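  //
  // In outline, the prolog emitted below takes one of two shapes:
  //   stack_bang_size > 0 : bang the stack pages; push rbp;
  //                         (mov rbp, rsp if PreserveFramePointer); sub rsp, framesize
  //   otherwise           : sub rsp, framesize (forced 4-byte immediate); store rbp into
  //                         the frame (and point rbp at that slot if PreserveFramePointer)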
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
  // If the method sets the FPU control word, do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from the actual stub when not emitting code just to measure its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 #if INCLUDE_RTM_OPT
 176 
 177 // Update rtm_counters based on abort status
 178 // input: abort_status
 179 //        rtm_counters (RTMLockingCounters*)
 180 // flags are killed
 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 182 
 183   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 184   if (PrintPreciseRTMLockingStatistics) {
 185     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 186       Label check_abort;
 187       testl(abort_status, (1<<i));
 188       jccb(Assembler::equal, check_abort);
 189       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 190       bind(check_abort);
 191     }
 192   }
 193 }
 194 
 195 // Branch if (random & (count-1) != 0), count is 2^n
 196 // tmp, scr and flags are killed
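// Note: the low 32 bits of the TSC (left in EAX by rdtsc) act as a cheap pseudo-random
// value, so on average the branch falls through, and the caller's counter update runs,
// only once per 'count' calls.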
 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 198   assert(tmp == rax, "");
 199   assert(scr == rdx, "");
 200   rdtsc(); // modifies EDX:EAX
 201   andptr(tmp, count-1);
 202   jccb(Assembler::notZero, brLabel);
 203 }
 204 
 205 // Perform abort ratio calculation, set no_rtm bit if high ratio
 206 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 207 // tmpReg, rtm_counters_Reg and flags are killed
 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 209                                                     Register rtm_counters_Reg,
 210                                                     RTMLockingCounters* rtm_counters,
 211                                                     Metadata* method_data) {
 212   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 213 
 214   if (RTMLockingCalculationDelay > 0) {
 215     // Delay calculation
 216     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 217     testptr(tmpReg, tmpReg);
 218     jccb(Assembler::equal, L_done);
 219   }
  // Abort ratio calculation only if abort_count >= RTMAbortThreshold
 221   //   Aborted transactions = abort_count * 100
 222   //   All transactions = total_count *  RTMTotalCountIncrRate
 223   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
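  //   In other words, the no_rtm bit is set below when
  //     abort_count / (total_count * RTMTotalCountIncrRate) >= RTMAbortRatio / 100,
  //   i.e. RTMAbortRatio is interpreted as a percentage.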
 224 
 225   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 226   cmpptr(tmpReg, RTMAbortThreshold);
 227   jccb(Assembler::below, L_check_always_rtm2);
 228   imulptr(tmpReg, tmpReg, 100);
 229 
 230   Register scrReg = rtm_counters_Reg;
 231   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 232   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 233   imulptr(scrReg, scrReg, RTMAbortRatio);
 234   cmpptr(tmpReg, scrReg);
 235   jccb(Assembler::below, L_check_always_rtm1);
 236   if (method_data != nullptr) {
 237     // set rtm_state to "no rtm" in MDO
 238     mov_metadata(tmpReg, method_data);
 239     lock();
 240     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 241   }
 242   jmpb(L_done);
 243   bind(L_check_always_rtm1);
 244   // Reload RTMLockingCounters* address
 245   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 246   bind(L_check_always_rtm2);
 247   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 248   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 249   jccb(Assembler::below, L_done);
 250   if (method_data != nullptr) {
 251     // set rtm_state to "always rtm" in MDO
 252     mov_metadata(tmpReg, method_data);
 253     lock();
 254     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 255   }
 256   bind(L_done);
 257 }
 258 
 259 // Update counters and perform abort ratio calculation
 260 // input:  abort_status_Reg
 261 // rtm_counters_Reg, flags are killed
 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 263                                       Register rtm_counters_Reg,
 264                                       RTMLockingCounters* rtm_counters,
 265                                       Metadata* method_data,
 266                                       bool profile_rtm) {
 267 
 268   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 269   // update rtm counters based on rax value at abort
 270   // reads abort_status_Reg, updates flags
 271   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 272   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 273   if (profile_rtm) {
 274     // Save abort status because abort_status_Reg is used by following code.
 275     if (RTMRetryCount > 0) {
 276       push(abort_status_Reg);
 277     }
 278     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 279     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 280     // restore abort status
 281     if (RTMRetryCount > 0) {
 282       pop(abort_status_Reg);
 283     }
 284   }
 285 }
 286 
 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 288 // inputs: retry_count_Reg
 289 //       : abort_status_Reg
 290 // output: retry_count_Reg decremented by 1
 291 // flags are killed
 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 293   Label doneRetry;
 294   assert(abort_status_Reg == rax, "");
 295   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 296   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 297   // if reason is in 0x6 and retry count != 0 then retry
 298   andptr(abort_status_Reg, 0x6);
 299   jccb(Assembler::zero, doneRetry);
 300   testl(retry_count_Reg, retry_count_Reg);
 301   jccb(Assembler::zero, doneRetry);
 302   pause();
 303   decrementl(retry_count_Reg);
 304   jmp(retryLabel);
 305   bind(doneRetry);
 306 }
 307 
 308 // Spin and retry if lock is busy,
 309 // inputs: box_Reg (monitor address)
 310 //       : retry_count_Reg
 311 // output: retry_count_Reg decremented by 1
 312 //       : clear z flag if retry count exceeded
 313 // tmp_Reg, scr_Reg, flags are killed
 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 315                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 316   Label SpinLoop, SpinExit, doneRetry;
 317   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 318 
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   decrementl(retry_count_Reg);
 322   movptr(scr_Reg, RTMSpinLoopCount);
 323 
 324   bind(SpinLoop);
 325   pause();
 326   decrementl(scr_Reg);
 327   jccb(Assembler::lessEqual, SpinExit);
 328   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 329   testptr(tmp_Reg, tmp_Reg);
 330   jccb(Assembler::notZero, SpinLoop);
 331 
 332   bind(SpinExit);
 333   jmp(retryLabel);
 334   bind(doneRetry);
 335   incrementl(retry_count_Reg); // clear z flag
 336 }
 337 
 338 // Use RTM for normal stack locks
 339 // Input: objReg (object to lock)
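//
// Roughly: optionally count the attempt, then xbegin a transaction; if the mark word still
// looks unlocked we are done and the stack lock is elided. Otherwise xend/xabort out of the
// transaction, record the abort status for profiling if requested, and retry up to
// RTMRetryCount times before falling through to the regular locking path in the caller.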
 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 341                                          Register retry_on_abort_count_Reg,
 342                                          RTMLockingCounters* stack_rtm_counters,
 343                                          Metadata* method_data, bool profile_rtm,
 344                                          Label& DONE_LABEL, Label& IsInflated) {
 345   assert(UseRTMForStackLocks, "why call this otherwise?");
 346   assert(tmpReg == rax, "");
 347   assert(scrReg == rdx, "");
 348   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 349 
 350   if (RTMRetryCount > 0) {
 351     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 352     bind(L_rtm_retry);
 353   }
 354   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 355   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 356   jcc(Assembler::notZero, IsInflated);
 357 
 358   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 359     Label L_noincrement;
 360     if (RTMTotalCountIncrRate > 1) {
 361       // tmpReg, scrReg and flags are killed
 362       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 363     }
 364     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 365     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 366     bind(L_noincrement);
 367   }
 368   xbegin(L_on_abort);
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 370   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 371   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 372   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 373 
 374   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 375   if (UseRTMXendForLockBusy) {
 376     xend();
 377     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 378     jmp(L_decrement_retry);
 379   }
 380   else {
 381     xabort(0);
 382   }
 383   bind(L_on_abort);
 384   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 385     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 386   }
 387   bind(L_decrement_retry);
 388   if (RTMRetryCount > 0) {
 389     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 390     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 391   }
 392 }
 393 
 394 // Use RTM for inflating locks
 395 // inputs: objReg (object to lock)
 396 //         boxReg (on-stack box address (displaced header location) - KILLED)
 397 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
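//
// Roughly: xbegin a transaction and check the monitor's _owner; if it is null the inflated
// lock is elided and we are done. Otherwise xend/xabort, profile the abort status if
// requested, and fall back to a CAS of the current thread into _owner, spinning and
// retrying up to RTMRetryCount times on abort or busy before giving up.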
 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 399                                             Register scrReg, Register retry_on_busy_count_Reg,
 400                                             Register retry_on_abort_count_Reg,
 401                                             RTMLockingCounters* rtm_counters,
 402                                             Metadata* method_data, bool profile_rtm,
 403                                             Label& DONE_LABEL) {
 404   assert(UseRTMLocking, "why call this otherwise?");
 405   assert(tmpReg == rax, "");
 406   assert(scrReg == rdx, "");
 407   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 408   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 409 
 410   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 411   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 412 
 413   if (RTMRetryCount > 0) {
 414     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 415     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 416     bind(L_rtm_retry);
 417   }
 418   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 419     Label L_noincrement;
 420     if (RTMTotalCountIncrRate > 1) {
 421       // tmpReg, scrReg and flags are killed
 422       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 423     }
 424     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 425     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 426     bind(L_noincrement);
 427   }
 428   xbegin(L_on_abort);
 429   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 430   movptr(tmpReg, Address(tmpReg, owner_offset));
 431   testptr(tmpReg, tmpReg);
 432   jcc(Assembler::zero, DONE_LABEL);
 433   if (UseRTMXendForLockBusy) {
 434     xend();
 435     jmp(L_decrement_retry);
 436   }
 437   else {
 438     xabort(0);
 439   }
 440   bind(L_on_abort);
 441   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 442   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 443     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 444   }
 445   if (RTMRetryCount > 0) {
 446     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 447     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 448   }
 449 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 453 
 454   // Appears unlocked - try to swing _owner from null to non-null.
 455   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 456 #ifdef _LP64
 457   Register threadReg = r15_thread;
 458 #else
 459   get_thread(scrReg);
 460   Register threadReg = scrReg;
 461 #endif
 462   lock();
 463   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 464 
 465   if (RTMRetryCount > 0) {
 466     // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
 468     bind(L_decrement_retry);
 469     // Spin and retry if lock is busy.
 470     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 471   }
 472   else {
 473     bind(L_decrement_retry);
 474   }
 475 }
 476 
 477 #endif //  INCLUDE_RTM_OPT
 478 
 479 // fast_lock and fast_unlock used by C2
 480 
 481 // Because the transitions from emitted code to the runtime
 482 // monitorenter/exit helper stubs are so slow it's critical that
 483 // we inline both the stack-locking fast path and the inflated fast path.
 484 //
 485 // See also: cmpFastLock and cmpFastUnlock.
 486 //
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 499 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 508 //
 509 // TODO:
 510 //
 511 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 512 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 513 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 514 //    the lock operators would typically be faster than reifying Self.
 515 //
 516 // *  Ideally I'd define the primitives as:
 517 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 518 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 519 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 521 //    Furthermore the register assignments are overconstrained, possibly resulting in
 522 //    sub-optimal code near the synchronization site.
 523 //
 524 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 525 //    Alternately, use a better sp-proximity test.
 526 //
 527 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 528 //    Either one is sufficient to uniquely identify a thread.
 529 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 530 //
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 534 //
 535 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 536 //    But beware of excessive branch density on AMD Opterons.
 537 //
 538 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 539 //    or failure of the fast path.  If the fast path fails then we pass
 540 //    control to the slow path, typically in C.  In fast_lock and
 541 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 542 //    will emit a conditional branch immediately after the node.
 543 //    So we have branches to branches and lots of ICC.ZF games.
 544 //    Instead, it might be better to have C2 pass a "FailureLabel"
 545 //    into fast_lock and fast_unlock.  In the case of success, control
 546 //    will drop through the node.  ICC.ZF is undefined at exit.
 547 //    In the case of failure, the node will branch directly to the
 548 //    FailureLabel
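//
// Note: as emitted today, the matching .ad rules (cmpFastLock / cmpFastUnlock, mentioned
// above) simply branch on ICC.ZF right after this code: ZF == 1 means the fast path
// succeeded, ZF == 0 routes control to the runtime monitorenter/monitorexit slow path.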
 549 
 550 
 551 // obj: object to lock
 552 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 554 // scr: tmp -- KILLED
 555 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 556                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 557                                  RTMLockingCounters* rtm_counters,
 558                                  RTMLockingCounters* stack_rtm_counters,
 559                                  Metadata* method_data,
 560                                  bool use_rtm, bool profile_rtm) {
 561   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 562   // Ensure the register assignments are disjoint
 563   assert(tmpReg == rax, "");
 564 
 565   if (use_rtm) {
 566     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 567   } else {
 568     assert(cx1Reg == noreg, "");
 569     assert(cx2Reg == noreg, "");
 570     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 571   }
 572 
 573   // Possible cases that we'll encounter in fast_lock
 574   // ------------------------------------------------
 575   // * Inflated
 576   //    -- unlocked
 577   //    -- Locked
 578   //       = by self
 579   //       = by other
 580   // * neutral
 581   // * stack-locked
 582   //    -- by self
 583   //       = sp-proximity test hits
 584   //       = sp-proximity test generates false-negative
 585   //    -- by other
 586   //
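  // A rough sketch of the LM_LEGACY fast path below (LM_MONITOR simply forces ZF = 0):
  //   mark = obj->mark();
  //   if (mark & monitor_value) goto IsInflated;          // inflated: CAS(_owner: null -> thread)
  //   box->displaced = mark | unlocked_value;             // anticipate a successful CAS
  //   if (CAS(obj->mark: mark|unlocked -> box)) ZF = 1;   // stack-lock acquired
  //   else { tmp = mark - rsp;                            // recursive stack-lock?
  //          ZF = ((tmp & (7 - page_size)) == 0); box->displaced = tmp; }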
 587 
 588   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 589 
 590   if (DiagnoseSyncOnValueBasedClasses != 0) {
 591     load_klass(tmpReg, objReg, scrReg);
 592     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 593     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 594     jcc(Assembler::notZero, DONE_LABEL);
 595   }
 596 
 597 #if INCLUDE_RTM_OPT
 598   if (UseRTMForStackLocks && use_rtm) {
 599     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 600     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 601                       stack_rtm_counters, method_data, profile_rtm,
 602                       DONE_LABEL, IsInflated);
 603   }
 604 #endif // INCLUDE_RTM_OPT
 605 
 606   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 607   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 608   jcc(Assembler::notZero, IsInflated);
 609 
 610   if (LockingMode == LM_MONITOR) {
 611     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 612     testptr(objReg, objReg);
 613   } else {
 614     assert(LockingMode == LM_LEGACY, "must be");
 615     // Attempt stack-locking ...
 616     orptr (tmpReg, markWord::unlocked_value);
 617     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 618     lock();
 619     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 620     jcc(Assembler::equal, COUNT);           // Success
 621 
 622     // Recursive locking.
 623     // The object is stack-locked: markword contains stack pointer to BasicLock.
 624     // Locked by current thread if difference with current SP is less than one page.
 625     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 627     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 628     movptr(Address(boxReg, 0), tmpReg);
 629   }
 630   jmp(DONE_LABEL);
 631 
 632   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.
 634 
 635 #if INCLUDE_RTM_OPT
 636   // Use the same RTM locking code in 32- and 64-bit VM.
 637   if (use_rtm) {
 638     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 639                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 640   } else {
 641 #endif // INCLUDE_RTM_OPT
 642 
 643 #ifndef _LP64
 644   // The object is inflated.
 645 
 646   // boxReg refers to the on-stack BasicLock in the current frame.
 647   // We'd like to write:
 648   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 650   // additional latency as we have another ST in the store buffer that must drain.
 651 
 652   // avoid ST-before-CAS
 653   // register juggle because we need tmpReg for cmpxchgptr below
 654   movptr(scrReg, boxReg);
 655   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 656 
 657   // Optimistic form: consider XORL tmpReg,tmpReg
 658   movptr(tmpReg, NULL_WORD);
 659 
 660   // Appears unlocked - try to swing _owner from null to non-null.
 661   // Ideally, I'd manifest "Self" with get_thread and then attempt
 662   // to CAS the register containing Self into m->Owner.
 663   // But we don't have enough registers, so instead we can either try to CAS
 664   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 665   // we later store "Self" into m->Owner.  Transiently storing a stack address
 666   // (rsp or the address of the box) into  m->owner is harmless.
 667   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 668   lock();
 669   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 670   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 671   // If we weren't able to swing _owner from null to the BasicLock
 672   // then take the slow path.
 673   jccb  (Assembler::notZero, NO_COUNT);
 674   // update _owner from BasicLock to thread
 675   get_thread (scrReg);                    // beware: clobbers ICCs
 676   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 677   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 678 
 679   // If the CAS fails we can either retry or pass control to the slow path.
 680   // We use the latter tactic.
 681   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 682   // If the CAS was successful ...
 683   //   Self has acquired the lock
 684   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 685   // Intentional fall-through into DONE_LABEL ...
 686 #else // _LP64
 687   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 688   movq(scrReg, tmpReg);
 689   xorq(tmpReg, tmpReg);
 690   lock();
 691   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 692   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 693   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 694   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 695   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 696   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 697 
 698   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 699   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 700   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 701   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 702 #endif // _LP64
 703 #if INCLUDE_RTM_OPT
 704   } // use_rtm()
 705 #endif
 706   bind(DONE_LABEL);
 707 
 708   // ZFlag == 1 count in fast path
 709   // ZFlag == 0 count in slow path
 710   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 711 
 712   bind(COUNT);
 713   // Count monitors in fast path
 714   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 715 
 716   xorl(tmpReg, tmpReg); // Set ZF == 1
 717 
 718   bind(NO_COUNT);
 719 
 720   // At NO_COUNT the icc ZFlag is set as follows ...
 721   // fast_unlock uses the same protocol.
 722   // ZFlag == 1 -> Success
 723   // ZFlag == 0 -> Failure - force control through the slow path
 724 }
 725 
 726 // obj: object to unlock
 727 // box: box address (displaced header location), killed.  Must be EAX.
 728 // tmp: killed, cannot be obj nor box.
 729 //
 730 // Some commentary on balanced locking:
 731 //
 732 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 733 // Methods that don't have provably balanced locking are forced to run in the
 734 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 735 // The interpreter provides two properties:
 736 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 738 //      interpreter maintains an on-stack list of locks currently held by
 739 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 742 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 744 // B() doesn't have provably balanced locking so it runs in the interpreter.
 745 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 746 // is still locked by A().
 747 //
 748 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 749 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 750 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 751 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 755 // A perfectly viable alternative is to elide the owner check except when
 756 // Xcheck:jni is enabled.
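//
// A rough sketch of the LM_LEGACY fast path below:
//   if (box->displaced == 0)                    ZF = 1;   // recursive stack-lock, nothing to do
//   else if (!(obj->mark & monitor_value))                // stack-locked: swing the mark back
//     ZF = CAS(obj->mark: box -> box->displaced);
//   else                                                  // inflated: try a 1-0 exit
//     if there are no recursions and cxq|EntryList are empty, ST _owner = null; otherwise
//     play the _succ / reacquire games described below, or fail into the slow path (ZF = 0).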
 757 
 758 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 759   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 760   assert(boxReg == rax, "");
 761   assert_different_registers(objReg, boxReg, tmpReg);
 762 
 763   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 764 
 765 #if INCLUDE_RTM_OPT
 766   if (UseRTMForStackLocks && use_rtm) {
 767     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 768     Label L_regular_unlock;
 769     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 770     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 771     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 772     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 773     xend();                                                           // otherwise end...
 774     jmp(DONE_LABEL);                                                  // ... and we're done
 775     bind(L_regular_unlock);
 776   }
 777 #endif
 778 
 779   if (LockingMode == LM_LEGACY) {
 780     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 781     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 782   }
 783   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 784   if (LockingMode != LM_MONITOR) {
 785     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 786     jcc(Assembler::zero, Stacked);
 787   }
 788 
 789   // It's inflated.
 790 
 791 #if INCLUDE_RTM_OPT
 792   if (use_rtm) {
 793     Label L_regular_inflated_unlock;
 794     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 795     movptr(boxReg, Address(tmpReg, owner_offset));
 796     testptr(boxReg, boxReg);
 797     jccb(Assembler::notZero, L_regular_inflated_unlock);
 798     xend();
 799     jmp(DONE_LABEL);
 800     bind(L_regular_inflated_unlock);
 801   }
 802 #endif
 803 
 804   // Despite our balanced locking property we still check that m->_owner == Self
 805   // as java routines or native JNI code called by this thread might
 806   // have released the lock.
 807   // Refer to the comments in synchronizer.cpp for how we might encode extra
 808   // state in _succ so we can avoid fetching EntryList|cxq.
 809   //
 810   // If there's no contention try a 1-0 exit.  That is, exit without
 811   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 812   // we detect and recover from the race that the 1-0 exit admits.
 813   //
 814   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 815   // before it STs null into _owner, releasing the lock.  Updates
 816   // to data protected by the critical section must be visible before
 817   // we drop the lock (and thus before any other thread could acquire
 818   // the lock and observe the fields protected by the lock).
 819   // IA32's memory-model is SPO, so STs are ordered with respect to
 820   // each other and there's no need for an explicit barrier (fence).
 821   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 822 #ifndef _LP64
 823   // Note that we could employ various encoding schemes to reduce
 824   // the number of loads below (currently 4) to just 2 or 3.
 825   // Refer to the comments in synchronizer.cpp.
 826   // In practice the chain of fetches doesn't seem to impact performance, however.
 827   xorptr(boxReg, boxReg);
 828   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 829   jccb  (Assembler::notZero, DONE_LABEL);
 830   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 831   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 832   jccb  (Assembler::notZero, DONE_LABEL);
 833   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 834   jmpb  (DONE_LABEL);
 835 #else // _LP64
 836   // It's inflated
 837   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 838 
 839   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 840   jccb(Assembler::equal, LNotRecursive);
 841 
 842   // Recursive inflated unlock
 843   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 844   jmpb(LSuccess);
 845 
 846   bind(LNotRecursive);
 847   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 848   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 849   jccb  (Assembler::notZero, CheckSucc);
 850   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 851   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 852   jmpb  (DONE_LABEL);
 853 
 854   // Try to avoid passing control into the slow_path ...
 855   bind  (CheckSucc);
 856 
 857   // The following optional optimization can be elided if necessary
 858   // Effectively: if (succ == null) goto slow path
 859   // The code reduces the window for a race, however,
 860   // and thus benefits performance.
 861   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 862   jccb  (Assembler::zero, LGoSlowPath);
 863 
 864   xorptr(boxReg, boxReg);
 865   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 866   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 867 
 868   // Memory barrier/fence
 869   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 870   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 871   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 872   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 873   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 874   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 875   lock(); addl(Address(rsp, 0), 0);
 876 
 877   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 878   jccb  (Assembler::notZero, LSuccess);
 879 
 880   // Rare inopportune interleaving - race.
 881   // The successor vanished in the small window above.
 882   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 883   // We need to ensure progress and succession.
 884   // Try to reacquire the lock.
 885   // If that fails then the new owner is responsible for succession and this
 886   // thread needs to take no further action and can exit via the fast path (success).
 887   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 891 
 892   // box is really RAX -- the following CMPXCHG depends on that binding
 893   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 894   lock();
 895   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor, so we try to regrab the lock.
  // If that doesn't work, then another thread grabbed the
  // lock, so we're done (and the exit was a success).
 899   jccb  (Assembler::notEqual, LSuccess);
 900   // Intentional fall-through into slow path
 901 
 902   bind  (LGoSlowPath);
 903   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 904   jmpb  (DONE_LABEL);
 905 
 906   bind  (LSuccess);
 907   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 908   jmpb  (DONE_LABEL);
 909 
 910 #endif
 911   if (LockingMode == LM_LEGACY) {
 912     bind  (Stacked);
 913     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 914     lock();
 915     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 916     // Intentional fall-thru into DONE_LABEL
 917   }
 918 
 919   bind(DONE_LABEL);
 920 
 921   // ZFlag == 1 count in fast path
 922   // ZFlag == 0 count in slow path
 923   jccb(Assembler::notZero, NO_COUNT);
 924 
 925   bind(COUNT);
 926   // Count monitors in fast path
 927 #ifndef _LP64
 928   get_thread(tmpReg);
 929   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 930 #else // _LP64
 931   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 932 #endif
 933 
 934   xorl(tmpReg, tmpReg); // Set ZF == 1
 935 
 936   bind(NO_COUNT);
 937 }
 938 
 939 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 940                                               Register t, Register thread) {
 941   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 942   assert(rax_reg == rax, "Used for CAS");
 943   assert_different_registers(obj, box, rax_reg, t, thread);
 944 
 945   // Handle inflated monitor.
 946   Label inflated;
 947   // Finish fast lock successfully. ZF value is irrelevant.
 948   Label locked;
 949   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 950   Label slow_path;
 951 
 952   if (DiagnoseSyncOnValueBasedClasses != 0) {
 953     load_klass(rax_reg, obj, t);
 954     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 955     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 956     jcc(Assembler::notZero, slow_path);
 957   }
 958 
 959   const Register mark = t;
 960 
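  // A rough sketch of the fast path below:
  //   if (obj->mark is a monitor)              CAS _owner from null to thread
  //                                            (or bump _recursions if we already own it);
  //   else if (lock-stack is full)             take the slow path;
  //   else if (top of lock-stack == obj)       push obj again (recursive stack lock);
  //   else if (!CAS(obj->mark: 0b01 -> 0b00))  take the slow path;
  //   on success, push obj onto the per-thread lock-stack.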
 961   { // Lightweight Lock
 962 
 963     Label push;
 964 
 965     const Register top = box;
 966 
 967     // Load the mark.
 968     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 969 
 970     // Prefetch top.
 971     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 972 
 973     // Check for monitor (0b10).
 974     testptr(mark, markWord::monitor_value);
 975     jcc(Assembler::notZero, inflated);
 976 
 977     // Check if lock-stack is full.
 978     cmpl(top, LockStack::end_offset() - 1);
 979     jcc(Assembler::greater, slow_path);
 980 
 981     // Check if recursive.
 982     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 983     jccb(Assembler::equal, push);
 984 
 985     // Try to lock. Transition lock bits 0b01 => 0b00
 986     movptr(rax_reg, mark);
 987     orptr(rax_reg, markWord::unlocked_value);
 988     andptr(mark, ~(int32_t)markWord::unlocked_value);
 989     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 990     jcc(Assembler::notEqual, slow_path);
 991 
 992     bind(push);
 993     // After successful lock, push object on lock-stack.
 994     movptr(Address(thread, top), obj);
 995     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 996     jmpb(locked);
 997   }
 998 
 999   { // Handle inflated monitor.
1000     bind(inflated);
1001 
1002     const Register tagged_monitor = mark;
1003 
1004     // CAS owner (null => current thread).
1005     xorptr(rax_reg, rax_reg);
1006     lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1007     jccb(Assembler::equal, locked);
1008 
1009     // Check if recursive.
1010     cmpptr(thread, rax_reg);
1011     jccb(Assembler::notEqual, slow_path);
1012 
1013     // Recursive.
1014     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1015   }
1016 
1017   bind(locked);
1018   increment(Address(thread, JavaThread::held_monitor_count_offset()));
1019   // Set ZF = 1
1020   xorl(rax_reg, rax_reg);
1021 
1022 #ifdef ASSERT
1023   // Check that locked label is reached with ZF set.
1024   Label zf_correct;
1025   jccb(Assembler::zero, zf_correct);
1026   stop("Fast Lock ZF != 1");
1027 #endif
1028 
1029   bind(slow_path);
1030 #ifdef ASSERT
1031   // Check that slow_path label is reached with ZF not set.
1032   jccb(Assembler::notZero, zf_correct);
1033   stop("Fast Lock ZF != 0");
1034   bind(zf_correct);
1035 #endif
1036   // C2 uses the value of ZF to determine the continuation.
1037 }
1038 
1039 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
1040   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1041   assert(reg_rax == rax, "Used for CAS");
1042   assert_different_registers(obj, reg_rax, t);
1043 
1044   // Handle inflated monitor.
1045   Label inflated, inflated_check_lock_stack;
1046   // Finish fast unlock successfully.  MUST jump with ZF == 1
1047   Label unlocked;
1048 
1049   // Assume success.
1050   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
1051 
1052   const Register mark = t;
1053   const Register top = reg_rax;
1054 
1055   Label dummy;
1056   C2FastUnlockLightweightStub* stub = nullptr;
1057 
1058   if (!Compile::current()->output()->in_scratch_emit_size()) {
1059     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
1060     Compile::current()->output()->add_stub(stub);
1061   }
1062 
1063   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1064   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1065 
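  // A rough sketch of the fast path below:
  //   if (top of lock-stack != obj)            it must be an inflated monitor (handled below);
  //   pop the lock-stack;
  //   if (the next entry is also obj)          done, it was a recursive stack lock;
  //   else if (!CAS(obj->mark: 0b00 -> 0b01))  re-push obj and take the slow path (via the
  //                                            out-of-line stub);
  //   otherwise the unlock succeeded.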
1066   { // Lightweight Unlock
1067 
1068     // Load top.
1069     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1070 
1071     // Prefetch mark.
1072     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1073 
1074     // Check if obj is top of lock-stack.
1075     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1076     // Top of lock stack was not obj. Must be monitor.
1077     jcc(Assembler::notEqual, inflated_check_lock_stack);
1078 
1079     // Pop lock-stack.
1080     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1081     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1082 
1083     // Check if recursive.
1084     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1085     jcc(Assembler::equal, unlocked);
1086 
1087     // We elide the monitor check, let the CAS fail instead.
1088 
1089     // Try to unlock. Transition lock bits 0b00 => 0b01
1090     movptr(reg_rax, mark);
1091     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1092     orptr(mark, markWord::unlocked_value);
1093     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1094     jcc(Assembler::notEqual, push_and_slow_path);
1095     jmp(unlocked);
1096   }
1097 
1098 
1099   { // Handle inflated monitor.
1100     bind(inflated_check_lock_stack);
1101 #ifdef ASSERT
1102     Label check_done;
1103     subl(top, oopSize);
1104     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1105     jcc(Assembler::below, check_done);
1106     cmpptr(obj, Address(thread, top));
1107     jccb(Assembler::notEqual, inflated_check_lock_stack);
1108     stop("Fast Unlock lock on stack");
1109     bind(check_done);
1110     testptr(mark, markWord::monitor_value);
1111     jccb(Assembler::notZero, inflated);
1112     stop("Fast Unlock not monitor");
1113 #endif
1114 
1115     bind(inflated);
1116 
1117     // mark contains the tagged ObjectMonitor*.
1118     const Register monitor = mark;
1119 
1120 #ifndef _LP64
1121     // Check if recursive.
1122     xorptr(reg_rax, reg_rax);
1123     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1124     jcc(Assembler::notZero, check_successor);
1125 
1126     // Check if the entry lists are empty.
1127     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1128     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1129     jcc(Assembler::notZero, check_successor);
1130 
1131     // Release lock.
1132     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1133 #else // _LP64
1134     Label recursive;
1135 
1136     // Check if recursive.
1137     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1138     jccb(Assembler::notEqual, recursive);
1139 
1140     // Check if the entry lists are empty.
1141     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1142     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1143     jcc(Assembler::notZero, check_successor);
1144 
1145     // Release lock.
1146     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1147     jmpb(unlocked);
1148 
1149     // Recursive unlock.
1150     bind(recursive);
1151     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1152     xorl(t, t);
1153 #endif
1154   }
1155 
1156   bind(unlocked);
1157   if (stub != nullptr) {
1158     bind(stub->unlocked_continuation());
1159   }
1160 
1161 #ifdef ASSERT
1162   // Check that unlocked label is reached with ZF set.
1163   Label zf_correct;
1164   jccb(Assembler::zero, zf_correct);
1165   stop("Fast Unlock ZF != 1");
1166 #endif
1167 
1168   if (stub != nullptr) {
1169     bind(stub->slow_path_continuation());
1170   }
1171 #ifdef ASSERT
1172   // Check that stub->continuation() label is reached with ZF not set.
1173   jccb(Assembler::notZero, zf_correct);
1174   stop("Fast Unlock ZF != 0");
1175   bind(zf_correct);
1176 #endif
1177   // C2 uses the value of ZF to determine the continuation.
1178 }
1179 
1180 //-------------------------------------------------------------------------------------------
// Generic instruction support for use in .ad files C2 code generation
1182 
1183 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1184   if (dst != src) {
1185     movdqu(dst, src);
1186   }
1187   if (opcode == Op_AbsVD) {
1188     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1189   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1191     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1192   }
1193 }
1194 
1195 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1196   if (opcode == Op_AbsVD) {
1197     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1198   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1200     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1201   }
1202 }
1203 
1204 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1205   if (dst != src) {
1206     movdqu(dst, src);
1207   }
1208   if (opcode == Op_AbsVF) {
1209     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1210   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1212     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1213   }
1214 }
1215 
1216 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1217   if (opcode == Op_AbsVF) {
1218     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1219   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1221     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1222   }
1223 }
1224 
1225 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1226   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1227   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1228 
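  // Note: SSE has no packed 64-bit signed min/max (vpminsq/vpmaxsq require AVX-512), so the
  // T_LONG cases below are synthesized with pcmpgtq + blendvpd, where the non-VEX blendvpd
  // implicitly uses xmm0 as the selector mask (hence the tmp == xmm0 requirement).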
1229   if (opcode == Op_MinV) {
1230     if (elem_bt == T_BYTE) {
1231       pminsb(dst, src);
1232     } else if (elem_bt == T_SHORT) {
1233       pminsw(dst, src);
1234     } else if (elem_bt == T_INT) {
1235       pminsd(dst, src);
1236     } else {
1237       assert(elem_bt == T_LONG, "required");
1238       assert(tmp == xmm0, "required");
1239       assert_different_registers(dst, src, tmp);
1240       movdqu(xmm0, dst);
1241       pcmpgtq(xmm0, src);
1242       blendvpd(dst, src);  // xmm0 as mask
1243     }
1244   } else { // opcode == Op_MaxV
1245     if (elem_bt == T_BYTE) {
1246       pmaxsb(dst, src);
1247     } else if (elem_bt == T_SHORT) {
1248       pmaxsw(dst, src);
1249     } else if (elem_bt == T_INT) {
1250       pmaxsd(dst, src);
1251     } else {
1252       assert(elem_bt == T_LONG, "required");
1253       assert(tmp == xmm0, "required");
1254       assert_different_registers(dst, src, tmp);
1255       movdqu(xmm0, src);
1256       pcmpgtq(xmm0, dst);
1257       blendvpd(dst, src);  // xmm0 as mask
1258     }
1259   }
1260 }
1261 
1262 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1263                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1264                                  int vlen_enc) {
1265   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1266 
1267   if (opcode == Op_MinV) {
1268     if (elem_bt == T_BYTE) {
1269       vpminsb(dst, src1, src2, vlen_enc);
1270     } else if (elem_bt == T_SHORT) {
1271       vpminsw(dst, src1, src2, vlen_enc);
1272     } else if (elem_bt == T_INT) {
1273       vpminsd(dst, src1, src2, vlen_enc);
1274     } else {
1275       assert(elem_bt == T_LONG, "required");
1276       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1277         vpminsq(dst, src1, src2, vlen_enc);
1278       } else {
1279         assert_different_registers(dst, src1, src2);
1280         vpcmpgtq(dst, src1, src2, vlen_enc);
1281         vblendvpd(dst, src1, src2, dst, vlen_enc);
1282       }
1283     }
1284   } else { // opcode == Op_MaxV
1285     if (elem_bt == T_BYTE) {
1286       vpmaxsb(dst, src1, src2, vlen_enc);
1287     } else if (elem_bt == T_SHORT) {
1288       vpmaxsw(dst, src1, src2, vlen_enc);
1289     } else if (elem_bt == T_INT) {
1290       vpmaxsd(dst, src1, src2, vlen_enc);
1291     } else {
1292       assert(elem_bt == T_LONG, "required");
1293       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1294         vpmaxsq(dst, src1, src2, vlen_enc);
1295       } else {
1296         assert_different_registers(dst, src1, src2);
1297         vpcmpgtq(dst, src1, src2, vlen_enc);
1298         vblendvpd(dst, src2, src1, dst, vlen_enc);
1299       }
1300     }
1301   }
1302 }
1303 
1304 // Float/Double min max
1305 
1306 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1307                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1308                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1309                                    int vlen_enc) {
1310   assert(UseAVX > 0, "required");
1311   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1312          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1313   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1314   assert_different_registers(a, tmp, atmp, btmp);
1315   assert_different_registers(b, tmp, atmp, btmp);
1316 
1317   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1318   bool is_double_word = is_double_word_type(elem_bt);
1319 
1320   /* Note on 'non-obvious' assembly sequence:
1321    *
1322    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1323    * and Java on how they handle floats:
1324    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return the second parameter when inputs are equal)
1325    *  b. NaN is not necessarily propagated (vminps/vmaxps will return the second parameter when either input is NaN)
1326    *
1327    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1328    *  a. -0.0/+0.0: Bias negative (positive) numbers to the second parameter before vminps (vmaxps)
1329    *                (only useful when the signs differ, a no-op otherwise)
1330    *  b. NaN: Check whether it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1331    *
1332    *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is analogous):
1333    *   btmp = (b < +0.0) ? a : b
1334    *   atmp = (b < +0.0) ? b : a
1335    *   tmp  = Max_Float(atmp, btmp)
1336    *   res  = (atmp == NaN) ? atmp : tmp
1337    */
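
       /* Illustrative trace of the pseudo code above (example values only):
        *  - max with a = -0.0, b = +0.0: b is not < +0.0, so btmp = +0.0 and atmp = -0.0;
        *    Max_Float(atmp, btmp) returns its second operand (+0.0) when the inputs compare
        *    equal, which is the Java-correct result.
        *  - max with a = NaN, b = 1.0: atmp = NaN, so Max_Float returns 1.0, but the final
        *    UNORD_Q check on atmp selects atmp instead, propagating the NaN as Java requires.
        */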
1338 
1339   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1340   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1341   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1342   XMMRegister mask;
1343 
1344   if (!is_double_word && is_min) {
1345     mask = a;
1346     vblend = &MacroAssembler::vblendvps;
1347     vmaxmin = &MacroAssembler::vminps;
1348     vcmp = &MacroAssembler::vcmpps;
1349   } else if (!is_double_word && !is_min) {
1350     mask = b;
1351     vblend = &MacroAssembler::vblendvps;
1352     vmaxmin = &MacroAssembler::vmaxps;
1353     vcmp = &MacroAssembler::vcmpps;
1354   } else if (is_double_word && is_min) {
1355     mask = a;
1356     vblend = &MacroAssembler::vblendvpd;
1357     vmaxmin = &MacroAssembler::vminpd;
1358     vcmp = &MacroAssembler::vcmppd;
1359   } else {
1360     assert(is_double_word && !is_min, "sanity");
1361     mask = b;
1362     vblend = &MacroAssembler::vblendvpd;
1363     vmaxmin = &MacroAssembler::vmaxpd;
1364     vcmp = &MacroAssembler::vcmppd;
1365   }
1366 
1367   // Pick which of tmp/btmp holds the min/max result and which is scratch, so the EnableX86ECoreOpts path of the blends below is not defeated when dst overlaps btmp
1368   XMMRegister maxmin, scratch;
1369   if (dst == btmp) {
1370     maxmin = btmp;
1371     scratch = tmp;
1372   } else {
1373     maxmin = tmp;
1374     scratch = btmp;
1375   }
1376 
1377   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1378   if (precompute_mask && !is_double_word) {
1379     vpsrad(tmp, mask, 32, vlen_enc);
1380     mask = tmp;
1381   } else if (precompute_mask && is_double_word) {
1382     vpxor(tmp, tmp, tmp, vlen_enc);
1383     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1384     mask = tmp;
1385   }
1386 
1387   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1388   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1389   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1390   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1391   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1392 }
1393 
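     // AVX-512 variant of vminmax_fp: the sign test, the operand swap, and the NaN fix-up
     // are done with opmask registers (evpmov*2m / evblendm* / evcmp*) instead of vector blends.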
1394 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1395                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1396                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1397                                     int vlen_enc) {
1398   assert(UseAVX > 2, "required");
1399   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1400          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1401   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1402   assert_different_registers(dst, a, atmp, btmp);
1403   assert_different_registers(dst, b, atmp, btmp);
1404 
1405   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1406   bool is_double_word = is_double_word_type(elem_bt);
1407   bool merge = true;
1408 
1409   if (!is_double_word && is_min) {
1410     evpmovd2m(ktmp, a, vlen_enc);
1411     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1412     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1413     vminps(dst, atmp, btmp, vlen_enc);
1414     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1415     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1416   } else if (!is_double_word && !is_min) {
1417     evpmovd2m(ktmp, b, vlen_enc);
1418     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1419     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1420     vmaxps(dst, atmp, btmp, vlen_enc);
1421     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1422     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1423   } else if (is_double_word && is_min) {
1424     evpmovq2m(ktmp, a, vlen_enc);
1425     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1426     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1427     vminpd(dst, atmp, btmp, vlen_enc);
1428     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1429     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1430   } else {
1431     assert(is_double_word && !is_min, "sanity");
1432     evpmovq2m(ktmp, b, vlen_enc);
1433     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1434     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1435     vmaxpd(dst, atmp, btmp, vlen_enc);
1436     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1437     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1438   }
1439 }
1440 
1441 // Float/Double signum
1442 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1443   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1444 
1445   Label DONE_LABEL;
1446 
1447   if (opcode == Op_SignumF) {
1448     assert(UseSSE > 0, "required");
1449     ucomiss(dst, zero);
1450     jcc(Assembler::equal, DONE_LABEL);    // special case +0.0/-0.0: if the argument is +/-0.0, return it unchanged
1451     jcc(Assembler::parity, DONE_LABEL);   // special case NaN: if the argument is NaN, return NaN
1452     movflt(dst, one);
1453     jcc(Assembler::above, DONE_LABEL);
1454     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1455   } else if (opcode == Op_SignumD) {
1456     assert(UseSSE > 1, "required");
1457     ucomisd(dst, zero);
1458     jcc(Assembler::equal, DONE_LABEL);    // special case +0.0/-0.0: if the argument is +/-0.0, return it unchanged
1459     jcc(Assembler::parity, DONE_LABEL);   // special case NaN: if the argument is NaN, return NaN
1460     movdbl(dst, one);
1461     jcc(Assembler::above, DONE_LABEL);
1462     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1463   }
1464 
1465   bind(DONE_LABEL);
1466 }
1467 
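     // Sign-/zero-extending element widenings: byte->word, byte->dword and word->dword.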
1468 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1469   if (sign) {
1470     pmovsxbw(dst, src);
1471   } else {
1472     pmovzxbw(dst, src);
1473   }
1474 }
1475 
1476 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1477   if (sign) {
1478     vpmovsxbw(dst, src, vector_len);
1479   } else {
1480     vpmovzxbw(dst, src, vector_len);
1481   }
1482 }
1483 
1484 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1485   if (sign) {
1486     vpmovsxbd(dst, src, vector_len);
1487   } else {
1488     vpmovzxbd(dst, src, vector_len);
1489   }
1490 }
1491 
1492 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1493   if (sign) {
1494     vpmovsxwd(dst, src, vector_len);
1495   } else {
1496     vpmovzxwd(dst, src, vector_len);
1497   }
1498 }
1499 
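     // Vector rotates (AVX-512 evprol*/evpror*): the *_imm flavor rotates by an immediate
     // count, the *_var flavor by a per-element count vector.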
1500 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1501                                      int shift, int vector_len) {
1502   if (opcode == Op_RotateLeftV) {
1503     if (etype == T_INT) {
1504       evprold(dst, src, shift, vector_len);
1505     } else {
1506       assert(etype == T_LONG, "expected type T_LONG");
1507       evprolq(dst, src, shift, vector_len);
1508     }
1509   } else {
1510     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1511     if (etype == T_INT) {
1512       evprord(dst, src, shift, vector_len);
1513     } else {
1514       assert(etype == T_LONG, "expected type T_LONG");
1515       evprorq(dst, src, shift, vector_len);
1516     }
1517   }
1518 }
1519 
1520 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1521                                      XMMRegister shift, int vector_len) {
1522   if (opcode == Op_RotateLeftV) {
1523     if (etype == T_INT) {
1524       evprolvd(dst, src, shift, vector_len);
1525     } else {
1526       assert(etype == T_LONG, "expected type T_LONG");
1527       evprolvq(dst, src, shift, vector_len);
1528     }
1529   } else {
1530     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1531     if (etype == T_INT) {
1532       evprorvd(dst, src, shift, vector_len);
1533     } else {
1534       assert(etype == T_LONG, "expected type T_LONG");
1535       evprorvq(dst, src, shift, vector_len);
1536     }
1537   }
1538 }
1539 
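     // Vector shift helpers. The *_imm flavors shift by an immediate count, the others by a
     // count held in an XMM register; the opcode selects arithmetic/logical and left/right.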
1540 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1541   if (opcode == Op_RShiftVI) {
1542     psrad(dst, shift);
1543   } else if (opcode == Op_LShiftVI) {
1544     pslld(dst, shift);
1545   } else {
1546     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1547     psrld(dst, shift);
1548   }
1549 }
1550 
1551 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1552   switch (opcode) {
1553     case Op_RShiftVI:  psrad(dst, shift); break;
1554     case Op_LShiftVI:  pslld(dst, shift); break;
1555     case Op_URShiftVI: psrld(dst, shift); break;
1556 
1557     default: assert(false, "%s", NodeClassNames[opcode]);
1558   }
1559 }
1560 
1561 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1562   if (opcode == Op_RShiftVI) {
1563     vpsrad(dst, nds, shift, vector_len);
1564   } else if (opcode == Op_LShiftVI) {
1565     vpslld(dst, nds, shift, vector_len);
1566   } else {
1567     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1568     vpsrld(dst, nds, shift, vector_len);
1569   }
1570 }
1571 
1572 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1573   switch (opcode) {
1574     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1575     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1576     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1577 
1578     default: assert(false, "%s", NodeClassNames[opcode]);
1579   }
1580 }
1581 
1582 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1583   switch (opcode) {
1584     case Op_RShiftVB:  // fall-through
1585     case Op_RShiftVS:  psraw(dst, shift); break;
1586 
1587     case Op_LShiftVB:  // fall-through
1588     case Op_LShiftVS:  psllw(dst, shift);   break;
1589 
1590     case Op_URShiftVS: // fall-through
1591     case Op_URShiftVB: psrlw(dst, shift);  break;
1592 
1593     default: assert(false, "%s", NodeClassNames[opcode]);
1594   }
1595 }
1596 
1597 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1598   switch (opcode) {
1599     case Op_RShiftVB:  // fall-through
1600     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1601 
1602     case Op_LShiftVB:  // fall-through
1603     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1604 
1605     case Op_URShiftVS: // fall-through
1606     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1607 
1608     default: assert(false, "%s", NodeClassNames[opcode]);
1609   }
1610 }
1611 
1612 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1613   switch (opcode) {
1614     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1615     case Op_LShiftVL:  psllq(dst, shift); break;
1616     case Op_URShiftVL: psrlq(dst, shift); break;
1617 
1618     default: assert(false, "%s", NodeClassNames[opcode]);
1619   }
1620 }
1621 
1622 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1623   if (opcode == Op_RShiftVL) {
1624     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1625   } else if (opcode == Op_LShiftVL) {
1626     psllq(dst, shift);
1627   } else {
1628     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1629     psrlq(dst, shift);
1630   }
1631 }
1632 
1633 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1634   switch (opcode) {
1635     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1636     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1637     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1638 
1639     default: assert(false, "%s", NodeClassNames[opcode]);
1640   }
1641 }
1642 
1643 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1644   if (opcode == Op_RShiftVL) {
1645     evpsraq(dst, nds, shift, vector_len);
1646   } else if (opcode == Op_LShiftVL) {
1647     vpsllq(dst, nds, shift, vector_len);
1648   } else {
1649     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1650     vpsrlq(dst, nds, shift, vector_len);
1651   }
1652 }
1653 
1654 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1655   switch (opcode) {
1656     case Op_RShiftVB:  // fall-through
1657     case Op_RShiftVS:  // fall-through
1658     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1659 
1660     case Op_LShiftVB:  // fall-through
1661     case Op_LShiftVS:  // fall-through
1662     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1663 
1664     case Op_URShiftVB: // fall-through
1665     case Op_URShiftVS: // fall-through
1666     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1667 
1668     default: assert(false, "%s", NodeClassNames[opcode]);
1669   }
1670 }
1671 
1672 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1673   switch (opcode) {
1674     case Op_RShiftVB:  // fall-through
1675     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1676 
1677     case Op_LShiftVB:  // fall-through
1678     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1679 
1680     case Op_URShiftVB: // fall-through
1681     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1682 
1683     default: assert(false, "%s", NodeClassNames[opcode]);
1684   }
1685 }
1686 
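     // Per-element variable shifts of long elements. Without AVX-512, the arithmetic right
     // shift is emulated with logical shifts: x >>s n == ((x >>u n) ^ m) - m, where
     // m = (0x8000000000000000 >>u n) is the shifted sign mask.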
1687 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1688   assert(UseAVX >= 2, "required");
1689   switch (opcode) {
1690     case Op_RShiftVL: {
1691       if (UseAVX > 2) {
1692         assert(tmp == xnoreg, "not used");
1693         if (!VM_Version::supports_avx512vl()) {
1694           vlen_enc = Assembler::AVX_512bit;
1695         }
1696         evpsravq(dst, src, shift, vlen_enc);
1697       } else {
1698         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1699         vpsrlvq(dst, src, shift, vlen_enc);
1700         vpsrlvq(tmp, tmp, shift, vlen_enc);
1701         vpxor(dst, dst, tmp, vlen_enc);
1702         vpsubq(dst, dst, tmp, vlen_enc);
1703       }
1704       break;
1705     }
1706     case Op_LShiftVL: {
1707       assert(tmp == xnoreg, "not used");
1708       vpsllvq(dst, src, shift, vlen_enc);
1709       break;
1710     }
1711     case Op_URShiftVL: {
1712       assert(tmp == xnoreg, "not used");
1713       vpsrlvq(dst, src, shift, vlen_enc);
1714       break;
1715     }
1716     default: assert(false, "%s", NodeClassNames[opcode]);
1717   }
1718 }
1719 
1720 // Variable shift of byte elements: shift src by shift using vtmp as a TEMP, giving a word result per element in dst
1721 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1722   assert(opcode == Op_LShiftVB ||
1723          opcode == Op_RShiftVB ||
1724          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1725   bool sign = (opcode != Op_URShiftVB);
1726   assert(vector_len == 0, "required");
1727   vextendbd(sign, dst, src, 1);
1728   vpmovzxbd(vtmp, shift, 1);
1729   varshiftd(opcode, dst, dst, vtmp, 1);
1730   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1731   vextracti128_high(vtmp, dst);
1732   vpackusdw(dst, dst, vtmp, 0);
1733 }
1734 
1735 // Variable shift of byte elements (AVX-512): shift src by shift using vtmp as a TEMP, giving a byte result in dst
1736 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1737   assert(opcode == Op_LShiftVB ||
1738          opcode == Op_RShiftVB ||
1739          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1740   bool sign = (opcode != Op_URShiftVB);
1741   int ext_vector_len = vector_len + 1;
1742   vextendbw(sign, dst, src, ext_vector_len);
1743   vpmovzxbw(vtmp, shift, ext_vector_len);
1744   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1745   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1746   if (vector_len == 0) {
1747     vextracti128_high(vtmp, dst);
1748     vpackuswb(dst, dst, vtmp, vector_len);
1749   } else {
1750     vextracti64x4_high(vtmp, dst);
1751     vpackuswb(dst, dst, vtmp, vector_len);
1752     vpermq(dst, dst, 0xD8, vector_len);
1753   }
1754 }
1755 
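     // Insert the general-purpose register value 'val' into element 'idx' of a vector,
     // dispatched on the element type.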
1756 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1757   switch(typ) {
1758     case T_BYTE:
1759       pinsrb(dst, val, idx);
1760       break;
1761     case T_SHORT:
1762       pinsrw(dst, val, idx);
1763       break;
1764     case T_INT:
1765       pinsrd(dst, val, idx);
1766       break;
1767     case T_LONG:
1768       pinsrq(dst, val, idx);
1769       break;
1770     default:
1771       assert(false,"Should not reach here.");
1772       break;
1773   }
1774 }
1775 
1776 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1777   switch(typ) {
1778     case T_BYTE:
1779       vpinsrb(dst, src, val, idx);
1780       break;
1781     case T_SHORT:
1782       vpinsrw(dst, src, val, idx);
1783       break;
1784     case T_INT:
1785       vpinsrd(dst, src, val, idx);
1786       break;
1787     case T_LONG:
1788       vpinsrq(dst, src, val, idx);
1789       break;
1790     default:
1791       assert(false,"Should not reach here.");
1792       break;
1793   }
1794 }
1795 
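     // Gather and scatter helpers, dispatched on the element type.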
1796 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1797   switch(typ) {
1798     case T_INT:
1799       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1800       break;
1801     case T_FLOAT:
1802       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1803       break;
1804     case T_LONG:
1805       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1806       break;
1807     case T_DOUBLE:
1808       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1809       break;
1810     default:
1811       assert(false,"Should not reach here.");
1812       break;
1813   }
1814 }
1815 
1816 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1817   switch(typ) {
1818     case T_INT:
1819       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1820       break;
1821     case T_FLOAT:
1822       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1823       break;
1824     case T_LONG:
1825       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1826       break;
1827     case T_DOUBLE:
1828       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1829       break;
1830     default:
1831       assert(false,"Should not reach here.");
1832       break;
1833   }
1834 }
1835 
1836 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1837   switch(typ) {
1838     case T_INT:
1839       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1840       break;
1841     case T_FLOAT:
1842       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1843       break;
1844     case T_LONG:
1845       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1846       break;
1847     case T_DOUBLE:
1848       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1849       break;
1850     default:
1851       assert(false,"Should not reach here.");
1852       break;
1853   }
1854 }
1855 
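     // Expand a vector of byte-sized booleans (0/1) into a full-width element mask:
     // 0 - x turns each byte into 0x00 or 0xFF, which is then sign-extended to the element size.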
1856 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1857   if (vlen_in_bytes <= 16) {
1858     pxor (dst, dst);
1859     psubb(dst, src);
1860     switch (elem_bt) {
1861       case T_BYTE:   /* nothing to do */ break;
1862       case T_SHORT:  pmovsxbw(dst, dst); break;
1863       case T_INT:    pmovsxbd(dst, dst); break;
1864       case T_FLOAT:  pmovsxbd(dst, dst); break;
1865       case T_LONG:   pmovsxbq(dst, dst); break;
1866       case T_DOUBLE: pmovsxbq(dst, dst); break;
1867 
1868       default: assert(false, "%s", type2name(elem_bt));
1869     }
1870   } else {
1871     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1872     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1873 
1874     vpxor (dst, dst, dst, vlen_enc);
1875     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1876 
1877     switch (elem_bt) {
1878       case T_BYTE:   /* nothing to do */            break;
1879       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1880       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1881       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1882       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1883       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1884 
1885       default: assert(false, "%s", type2name(elem_bt));
1886     }
1887   }
1888 }
1889 
1890 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1891   if (novlbwdq) {
1892     vpmovsxbd(xtmp, src, vlen_enc);
1893     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1894             Assembler::eq, true, vlen_enc, noreg);
1895   } else {
1896     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1897     vpsubb(xtmp, xtmp, src, vlen_enc);
1898     evpmovb2m(dst, xtmp, vlen_enc);
1899   }
1900 }
1901 
1902 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1903   switch (vlen_in_bytes) {
1904     case 4:  movdl(dst, src);   break;
1905     case 8:  movq(dst, src);    break;
1906     case 16: movdqu(dst, src);  break;
1907     case 32: vmovdqu(dst, src); break;
1908     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1909     default: ShouldNotReachHere();
1910   }
1911 }
1912 
1913 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1914   assert(rscratch != noreg || always_reachable(src), "missing");
1915 
1916   if (reachable(src)) {
1917     load_vector(dst, as_Address(src), vlen_in_bytes);
1918   } else {
1919     lea(rscratch, src);
1920     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1921   }
1922 }
1923 
1924 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1925   int vlen_enc = vector_length_encoding(vlen);
1926   if (VM_Version::supports_avx()) {
1927     if (bt == T_LONG) {
1928       if (VM_Version::supports_avx2()) {
1929         vpbroadcastq(dst, src, vlen_enc);
1930       } else {
1931         vmovddup(dst, src, vlen_enc);
1932       }
1933     } else if (bt == T_DOUBLE) {
1934       if (vlen_enc != Assembler::AVX_128bit) {
1935         vbroadcastsd(dst, src, vlen_enc, noreg);
1936       } else {
1937         vmovddup(dst, src, vlen_enc);
1938       }
1939     } else {
1940       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1941         vpbroadcastd(dst, src, vlen_enc);
1942       } else {
1943         vbroadcastss(dst, src, vlen_enc);
1944       }
1945     }
1946   } else if (VM_Version::supports_sse3()) {
1947     movddup(dst, src);
1948   } else {
1949     movq(dst, src);
1950     if (vlen == 16) {
1951       punpcklqdq(dst, dst);
1952     }
1953   }
1954 }
1955 
1956 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1957   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1958   int offset = exact_log2(type2aelembytes(bt)) << 6;
1959   if (is_floating_point_type(bt)) {
1960     offset += 128;
1961   }
1962   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1963   load_vector(dst, addr, vlen_in_bytes);
1964 }
1965 
1966 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1967 
1968 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1969   int vector_len = Assembler::AVX_128bit;
1970 
1971   switch (opcode) {
1972     case Op_AndReductionV:  pand(dst, src); break;
1973     case Op_OrReductionV:   por (dst, src); break;
1974     case Op_XorReductionV:  pxor(dst, src); break;
1975     case Op_MinReductionV:
1976       switch (typ) {
1977         case T_BYTE:        pminsb(dst, src); break;
1978         case T_SHORT:       pminsw(dst, src); break;
1979         case T_INT:         pminsd(dst, src); break;
1980         case T_LONG:        assert(UseAVX > 2, "required");
1981                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1982         default:            assert(false, "wrong type");
1983       }
1984       break;
1985     case Op_MaxReductionV:
1986       switch (typ) {
1987         case T_BYTE:        pmaxsb(dst, src); break;
1988         case T_SHORT:       pmaxsw(dst, src); break;
1989         case T_INT:         pmaxsd(dst, src); break;
1990         case T_LONG:        assert(UseAVX > 2, "required");
1991                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1992         default:            assert(false, "wrong type");
1993       }
1994       break;
1995     case Op_AddReductionVF: addss(dst, src); break;
1996     case Op_AddReductionVD: addsd(dst, src); break;
1997     case Op_AddReductionVI:
1998       switch (typ) {
1999         case T_BYTE:        paddb(dst, src); break;
2000         case T_SHORT:       paddw(dst, src); break;
2001         case T_INT:         paddd(dst, src); break;
2002         default:            assert(false, "wrong type");
2003       }
2004       break;
2005     case Op_AddReductionVL: paddq(dst, src); break;
2006     case Op_MulReductionVF: mulss(dst, src); break;
2007     case Op_MulReductionVD: mulsd(dst, src); break;
2008     case Op_MulReductionVI:
2009       switch (typ) {
2010         case T_SHORT:       pmullw(dst, src); break;
2011         case T_INT:         pmulld(dst, src); break;
2012         default:            assert(false, "wrong type");
2013       }
2014       break;
2015     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2016                             evpmullq(dst, dst, src, vector_len); break;
2017     default:                assert(false, "wrong opcode");
2018   }
2019 }
2020 
2021 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2022   int vector_len = Assembler::AVX_256bit;
2023 
2024   switch (opcode) {
2025     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2026     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2027     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2028     case Op_MinReductionV:
2029       switch (typ) {
2030         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2031         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2032         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2033         case T_LONG:        assert(UseAVX > 2, "required");
2034                             vpminsq(dst, src1, src2, vector_len); break;
2035         default:            assert(false, "wrong type");
2036       }
2037       break;
2038     case Op_MaxReductionV:
2039       switch (typ) {
2040         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2041         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2042         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2043         case T_LONG:        assert(UseAVX > 2, "required");
2044                             vpmaxsq(dst, src1, src2, vector_len); break;
2045         default:            assert(false, "wrong type");
2046       }
2047       break;
2048     case Op_AddReductionVI:
2049       switch (typ) {
2050         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2051         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2052         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2053         default:            assert(false, "wrong type");
2054       }
2055       break;
2056     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2057     case Op_MulReductionVI:
2058       switch (typ) {
2059         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2060         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2061         default:            assert(false, "wrong type");
2062       }
2063       break;
2064     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2065     default:                assert(false, "wrong opcode");
2066   }
2067 }
2068 
2069 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2070                                   XMMRegister dst, XMMRegister src,
2071                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2072   switch (opcode) {
2073     case Op_AddReductionVF:
2074     case Op_MulReductionVF:
2075       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2076       break;
2077 
2078     case Op_AddReductionVD:
2079     case Op_MulReductionVD:
2080       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2081       break;
2082 
2083     default: assert(false, "wrong opcode");
2084   }
2085 }
2086 
2087 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2088                              Register dst, Register src1, XMMRegister src2,
2089                              XMMRegister vtmp1, XMMRegister vtmp2) {
2090   switch (vlen) {
2091     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2092     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2093     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2094     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2095 
2096     default: assert(false, "wrong vector length");
2097   }
2098 }
2099 
2100 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2101                              Register dst, Register src1, XMMRegister src2,
2102                              XMMRegister vtmp1, XMMRegister vtmp2) {
2103   switch (vlen) {
2104     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2105     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2106     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2107     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2108 
2109     default: assert(false, "wrong vector length");
2110   }
2111 }
2112 
2113 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2114                              Register dst, Register src1, XMMRegister src2,
2115                              XMMRegister vtmp1, XMMRegister vtmp2) {
2116   switch (vlen) {
2117     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2118     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2119     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2120     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2121 
2122     default: assert(false, "wrong vector length");
2123   }
2124 }
2125 
2126 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2127                              Register dst, Register src1, XMMRegister src2,
2128                              XMMRegister vtmp1, XMMRegister vtmp2) {
2129   switch (vlen) {
2130     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2131     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2132     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2133     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2134 
2135     default: assert(false, "wrong vector length");
2136   }
2137 }
2138 
2139 #ifdef _LP64
2140 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2141                              Register dst, Register src1, XMMRegister src2,
2142                              XMMRegister vtmp1, XMMRegister vtmp2) {
2143   switch (vlen) {
2144     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2145     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2146     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2147 
2148     default: assert(false, "wrong vector length");
2149   }
2150 }
2151 #endif // _LP64
2152 
2153 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2154   switch (vlen) {
2155     case 2:
2156       assert(vtmp2 == xnoreg, "");
2157       reduce2F(opcode, dst, src, vtmp1);
2158       break;
2159     case 4:
2160       assert(vtmp2 == xnoreg, "");
2161       reduce4F(opcode, dst, src, vtmp1);
2162       break;
2163     case 8:
2164       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2165       break;
2166     case 16:
2167       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2168       break;
2169     default: assert(false, "wrong vector length");
2170   }
2171 }
2172 
2173 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2174   switch (vlen) {
2175     case 2:
2176       assert(vtmp2 == xnoreg, "");
2177       reduce2D(opcode, dst, src, vtmp1);
2178       break;
2179     case 4:
2180       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2181       break;
2182     case 8:
2183       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2184       break;
2185     default: assert(false, "wrong vector length");
2186   }
2187 }
2188 
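     // The reductions below repeatedly fold the upper half of the vector onto the lower half
     // (via shuffles or lane extracts) and finally combine the result with the scalar
     // accumulator src1.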
2189 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2190   if (opcode == Op_AddReductionVI) {
2191     if (vtmp1 != src2) {
2192       movdqu(vtmp1, src2);
2193     }
2194     phaddd(vtmp1, vtmp1);
2195   } else {
2196     pshufd(vtmp1, src2, 0x1);
2197     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2198   }
2199   movdl(vtmp2, src1);
2200   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2201   movdl(dst, vtmp1);
2202 }
2203 
2204 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2205   if (opcode == Op_AddReductionVI) {
2206     if (vtmp1 != src2) {
2207       movdqu(vtmp1, src2);
2208     }
2209     phaddd(vtmp1, src2);
2210     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2211   } else {
2212     pshufd(vtmp2, src2, 0xE);
2213     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2214     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2215   }
2216 }
2217 
2218 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2219   if (opcode == Op_AddReductionVI) {
2220     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2221     vextracti128_high(vtmp2, vtmp1);
2222     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2223     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2224   } else {
2225     vextracti128_high(vtmp1, src2);
2226     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2227     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2228   }
2229 }
2230 
2231 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2232   vextracti64x4_high(vtmp2, src2);
2233   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2234   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2235 }
2236 
2237 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2238   pshufd(vtmp2, src2, 0x1);
2239   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2240   movdqu(vtmp1, vtmp2);
2241   psrldq(vtmp1, 2);
2242   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2243   movdqu(vtmp2, vtmp1);
2244   psrldq(vtmp2, 1);
2245   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2246   movdl(vtmp2, src1);
2247   pmovsxbd(vtmp1, vtmp1);
2248   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2249   pextrb(dst, vtmp1, 0x0);
2250   movsbl(dst, dst);
2251 }
2252 
2253 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2254   pshufd(vtmp1, src2, 0xE);
2255   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2256   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2257 }
2258 
2259 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2260   vextracti128_high(vtmp2, src2);
2261   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2262   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2263 }
2264 
2265 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2266   vextracti64x4_high(vtmp1, src2);
2267   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2268   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2269 }
2270 
2271 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2272   pmovsxbw(vtmp2, src2);
2273   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2274 }
2275 
2276 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2277   if (UseAVX > 1) {
2278     int vector_len = Assembler::AVX_256bit;
2279     vpmovsxbw(vtmp1, src2, vector_len);
2280     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2281   } else {
2282     pmovsxbw(vtmp2, src2);
2283     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2284     pshufd(vtmp2, src2, 0xE);   // move the upper 8 bytes of src2 into the low half
2285     pmovsxbw(vtmp2, vtmp2);     // and sign-extend them to words
2286     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2287   }
2288 }
2289 
2290 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2291   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2292     int vector_len = Assembler::AVX_512bit;
2293     vpmovsxbw(vtmp1, src2, vector_len);
2294     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2295   } else {
2296     assert(UseAVX >= 2,"Should not reach here.");
2297     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2298     vextracti128_high(vtmp2, src2);
2299     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2300   }
2301 }
2302 
2303 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2304   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2305   vextracti64x4_high(vtmp2, src2);
2306   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2307 }
2308 
2309 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2310   if (opcode == Op_AddReductionVI) {
2311     if (vtmp1 != src2) {
2312       movdqu(vtmp1, src2);
2313     }
2314     phaddw(vtmp1, vtmp1);
2315     phaddw(vtmp1, vtmp1);
2316   } else {
2317     pshufd(vtmp2, src2, 0x1);
2318     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2319     movdqu(vtmp1, vtmp2);
2320     psrldq(vtmp1, 2);
2321     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2322   }
2323   movdl(vtmp2, src1);
2324   pmovsxwd(vtmp1, vtmp1);
2325   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2326   pextrw(dst, vtmp1, 0x0);
2327   movswl(dst, dst);
2328 }
2329 
2330 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2331   if (opcode == Op_AddReductionVI) {
2332     if (vtmp1 != src2) {
2333       movdqu(vtmp1, src2);
2334     }
2335     phaddw(vtmp1, src2);
2336   } else {
2337     pshufd(vtmp1, src2, 0xE);
2338     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2339   }
2340   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2341 }
2342 
2343 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2344   if (opcode == Op_AddReductionVI) {
2345     int vector_len = Assembler::AVX_256bit;
2346     vphaddw(vtmp2, src2, src2, vector_len);
2347     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2348   } else {
2349     vextracti128_high(vtmp2, src2);
2350     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2351   }
2352   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2353 }
2354 
2355 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2356   int vector_len = Assembler::AVX_256bit;
2357   vextracti64x4_high(vtmp1, src2);
2358   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2359   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2360 }
2361 
2362 #ifdef _LP64
2363 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2364   pshufd(vtmp2, src2, 0xE);
2365   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2366   movdq(vtmp1, src1);
2367   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2368   movdq(dst, vtmp1);
2369 }
2370 
2371 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2372   vextracti128_high(vtmp1, src2);
2373   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2374   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2375 }
2376 
2377 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2378   vextracti64x4_high(vtmp2, src2);
2379   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2380   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2381 }
2382 
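     // Build a k-register mask with the low 'len' bits set: bzhi zeroes the bits of an
     // all-ones pattern at and above index 'len'.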
2383 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2384   mov64(temp, -1L);
2385   bzhiq(temp, temp, len);
2386   kmovql(dst, temp);
2387 }
2388 #endif // _LP64
2389 
2390 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2391   reduce_operation_128(T_FLOAT, opcode, dst, src);
2392   pshufd(vtmp, src, 0x1);
2393   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2394 }
2395 
2396 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2397   reduce2F(opcode, dst, src, vtmp);
2398   pshufd(vtmp, src, 0x2);
2399   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2400   pshufd(vtmp, src, 0x3);
2401   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2402 }
2403 
2404 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2405   reduce4F(opcode, dst, src, vtmp2);
2406   vextractf128_high(vtmp2, src);
2407   reduce4F(opcode, dst, vtmp2, vtmp1);
2408 }
2409 
2410 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2411   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2412   vextracti64x4_high(vtmp1, src);
2413   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2414 }
2415 
2416 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2417   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2418   pshufd(vtmp, src, 0xE);
2419   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2420 }
2421 
2422 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2423   reduce2D(opcode, dst, src, vtmp2);
2424   vextractf128_high(vtmp2, src);
2425   reduce2D(opcode, dst, vtmp2, vtmp1);
2426 }
2427 
2428 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2429   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2430   vextracti64x4_high(vtmp1, src);
2431   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2432 }
2433 
2434 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2435   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2436 }
2437 
2438 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2439   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2440 }
2441 
2442 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2443                                  int vec_enc) {
2444   switch(elem_bt) {
2445     case T_INT:
2446     case T_FLOAT:
2447       vmaskmovps(dst, src, mask, vec_enc);
2448       break;
2449     case T_LONG:
2450     case T_DOUBLE:
2451       vmaskmovpd(dst, src, mask, vec_enc);
2452       break;
2453     default:
2454       fatal("Unsupported type %s", type2name(elem_bt));
2455       break;
2456   }
2457 }
2458 
2459 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2460                                  int vec_enc) {
2461   switch(elem_bt) {
2462     case T_INT:
2463     case T_FLOAT:
2464       vmaskmovps(dst, src, mask, vec_enc);
2465       break;
2466     case T_LONG:
2467     case T_DOUBLE:
2468       vmaskmovpd(dst, src, mask, vec_enc);
2469       break;
2470     default:
2471       fatal("Unsupported type %s", type2name(elem_bt));
2472       break;
2473   }
2474 }
2475 
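     // Min/max reduction over float lanes: repeatedly halve the vector (extract the upper
     // 256/128 bits or permute within a lane) and combine the halves with vminmax_fp,
     // folding in dst at the end when it carries an initial value (is_dst_valid).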
2476 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2477                                           XMMRegister dst, XMMRegister src,
2478                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2479                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2480   const int permconst[] = {1, 14};
2481   XMMRegister wsrc = src;
2482   XMMRegister wdst = xmm_0;
2483   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2484 
2485   int vlen_enc = Assembler::AVX_128bit;
2486   if (vlen == 16) {
2487     vlen_enc = Assembler::AVX_256bit;
2488   }
2489 
2490   for (int i = log2(vlen) - 1; i >=0; i--) {
2491     if (i == 0 && !is_dst_valid) {
2492       wdst = dst;
2493     }
2494     if (i == 3) {
2495       vextracti64x4_high(wtmp, wsrc);
2496     } else if (i == 2) {
2497       vextracti128_high(wtmp, wsrc);
2498     } else { // i = [0,1]
2499       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2500     }
2501     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2502     wsrc = wdst;
2503     vlen_enc = Assembler::AVX_128bit;
2504   }
2505   if (is_dst_valid) {
2506     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2507   }
2508 }
2509 
2510 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2511                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2512                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2513   XMMRegister wsrc = src;
2514   XMMRegister wdst = xmm_0;
2515   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2516   int vlen_enc = Assembler::AVX_128bit;
2517   if (vlen == 8) {
2518     vlen_enc = Assembler::AVX_256bit;
2519   }
2520   for (int i = log2(vlen) - 1; i >=0; i--) {
2521     if (i == 0 && !is_dst_valid) {
2522       wdst = dst;
2523     }
2524     if (i == 1) {
2525       vextracti128_high(wtmp, wsrc);
2526     } else if (i == 2) {
2527       vextracti64x4_high(wtmp, wsrc);
2528     } else {
2529       assert(i == 0, "%d", i);
2530       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2531     }
2532     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2533     wsrc = wdst;
2534     vlen_enc = Assembler::AVX_128bit;
2535   }
2536   if (is_dst_valid) {
2537     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2538   }
2539 }
2540 
2541 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2542   switch (bt) {
2543     case T_BYTE:  pextrb(dst, src, idx); break;
2544     case T_SHORT: pextrw(dst, src, idx); break;
2545     case T_INT:   pextrd(dst, src, idx); break;
2546     case T_LONG:  pextrq(dst, src, idx); break;
2547 
2548     default:
2549       assert(false,"Should not reach here.");
2550       break;
2551   }
2552 }
2553 
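     // Return the 128-bit lane that holds 'elemindex', extracting it into dst unless it is
     // the lowest lane, in which case src itself is returned.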
2554 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2555   int esize =  type2aelembytes(typ);
2556   int elem_per_lane = 16/esize;
2557   int lane = elemindex / elem_per_lane;
2558   int eindex = elemindex % elem_per_lane;
2559 
2560   if (lane >= 2) {
2561     assert(UseAVX > 2, "required");
2562     vextractf32x4(dst, src, lane & 3);
2563     return dst;
2564   } else if (lane > 0) {
2565     assert(UseAVX > 0, "required");
2566     vextractf128(dst, src, lane);
2567     return dst;
2568   } else {
2569     return src;
2570   }
2571 }
2572 
2573 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2574   if (typ == T_BYTE) {
2575     movsbl(dst, dst);
2576   } else if (typ == T_SHORT) {
2577     movswl(dst, dst);
2578   }
2579 }
2580 
2581 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2582   int esize =  type2aelembytes(typ);
2583   int elem_per_lane = 16/esize;
2584   int eindex = elemindex % elem_per_lane;
2585   assert(is_integral_type(typ),"required");
2586 
2587   if (eindex == 0) {
2588     if (typ == T_LONG) {
2589       movq(dst, src);
2590     } else {
2591       movdl(dst, src);
2592       movsxl(typ, dst);
2593     }
2594   } else {
2595     extract(typ, dst, src, eindex);
2596     movsxl(typ, dst);
2597   }
2598 }
2599 
2600 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2601   int esize =  type2aelembytes(typ);
2602   int elem_per_lane = 16/esize;
2603   int eindex = elemindex % elem_per_lane;
2604   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2605 
2606   if (eindex == 0) {
2607     movq(dst, src);
2608   } else {
2609     if (typ == T_FLOAT) {
2610       if (UseAVX == 0) {
2611         movdqu(dst, src);
2612         shufps(dst, dst, eindex);
2613       } else {
2614         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2615       }
2616     } else {
2617       if (UseAVX == 0) {
2618         movdqu(dst, src);
2619         psrldq(dst, eindex*esize);
2620       } else {
2621         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2622       }
2623       movq(dst, dst);
2624     }
2625   }
2626   // Zero upper bits
2627   if (typ == T_FLOAT) {
2628     if (UseAVX == 0) {
2629       assert(vtmp != xnoreg, "required.");
2630       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2631       pand(dst, vtmp);
2632     } else {
2633       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2634     }
2635   }
2636 }
2637 
2638 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2639   switch(typ) {
2640     case T_BYTE:
2641     case T_BOOLEAN:
2642       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2643       break;
2644     case T_SHORT:
2645     case T_CHAR:
2646       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2647       break;
2648     case T_INT:
2649     case T_FLOAT:
2650       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2651       break;
2652     case T_LONG:
2653     case T_DOUBLE:
2654       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2655       break;
2656     default:
2657       assert(false,"Should not reach here.");
2658       break;
2659   }
2660 }
2661 
2662 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2663   assert(rscratch != noreg || always_reachable(src2), "missing");
2664 
2665   switch(typ) {
2666     case T_BOOLEAN:
2667     case T_BYTE:
2668       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2669       break;
2670     case T_CHAR:
2671     case T_SHORT:
2672       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2673       break;
2674     case T_INT:
2675     case T_FLOAT:
2676       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2677       break;
2678     case T_LONG:
2679     case T_DOUBLE:
2680       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2681       break;
2682     default:
2683       assert(false,"Should not reach here.");
2684       break;
2685   }
2686 }
2687 
2688 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2689   switch(typ) {
2690     case T_BYTE:
2691       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2692       break;
2693     case T_SHORT:
2694       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2695       break;
2696     case T_INT:
2697     case T_FLOAT:
2698       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2699       break;
2700     case T_LONG:
2701     case T_DOUBLE:
2702       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2703       break;
2704     default:
2705       assert(false,"Should not reach here.");
2706       break;
2707   }
2708 }
2709 
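     // Sets the condition flags from src1 and src2 so callers can branch on them:
     // ptest/vptest set ZF if (src1 & src2) == 0 and CF if (~src1 & src2) == 0,
     // and vtestps does the same using only the sign bit of each 32-bit lane.
     // Vectors shorter than 16 bytes are first widened by duplicating the low
     // part of src1 into vtmp.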
2710 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2711   assert(vlen_in_bytes <= 32, "");
2712   int esize = type2aelembytes(bt);
2713   if (vlen_in_bytes == 32) {
2714     assert(vtmp == xnoreg, "required.");
2715     if (esize >= 4) {
2716       vtestps(src1, src2, AVX_256bit);
2717     } else {
2718       vptest(src1, src2, AVX_256bit);
2719     }
2720     return;
2721   }
2722   if (vlen_in_bytes < 16) {
2723     // Duplicate the lower part to fill the whole register;
2724     // there is no need to do so for src2.
2725     assert(vtmp != xnoreg, "required");
2726     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2727     pshufd(vtmp, src1, shuffle_imm);
2728   } else {
2729     assert(vtmp == xnoreg, "required");
2730     vtmp = src1;
2731   }
2732   if (esize >= 4 && VM_Version::supports_avx()) {
2733     vtestps(vtmp, src2, AVX_128bit);
2734   } else {
2735     ptest(vtmp, src2);
2736   }
2737 }
2738 
2739 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2740   assert(UseAVX >= 2, "required");
2741 #ifdef ASSERT
2742   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2743   bool is_bw_supported = VM_Version::supports_avx512bw();
2744   if (is_bw && !is_bw_supported) {
2745     assert(vlen_enc != Assembler::AVX_512bit, "required");
2746     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2747            "XMM register should be 0-15");
2748   }
2749 #endif // ASSERT
2750   switch (elem_bt) {
2751     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2752     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2753     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2754     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2755     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2756     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2757     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2758   }
2759 }
2760 
2761 #ifdef _LP64
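     // Broadcasts a scalar from a general-purpose register into every lane of a
     // vector register. Illustrative usage (register choices are only examples):
     //   vpbroadcast(T_INT, xmm0, rbx, Assembler::AVX_256bit)
     // replicates the low 32 bits of rbx into all eight int lanes. When an
     // AVX-512 GPR-source broadcast is not usable, the value is routed through
     // movdl/movdq and broadcast from the XMM register instead.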
2762 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2763   assert(UseAVX >= 2, "required");
2764   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2765   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2766   if ((UseAVX > 2) &&
2767       (!is_bw || VM_Version::supports_avx512bw()) &&
2768       (!is_vl || VM_Version::supports_avx512vl())) {
2769     switch (elem_bt) {
2770       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2771       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2772       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2773       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2774       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2775     }
2776   } else {
2777     assert(vlen_enc != Assembler::AVX_512bit, "required");
2778     assert((dst->encoding() < 16),"XMM register should be 0-15");
2779     switch (elem_bt) {
2780       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2781       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2782       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2783       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2784       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2785       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2786       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2787     }
2788   }
2789 }
2790 #endif
2791 
2792 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2793   switch (to_elem_bt) {
2794     case T_SHORT:
2795       vpmovsxbw(dst, src, vlen_enc);
2796       break;
2797     case T_INT:
2798       vpmovsxbd(dst, src, vlen_enc);
2799       break;
2800     case T_FLOAT:
2801       vpmovsxbd(dst, src, vlen_enc);
2802       vcvtdq2ps(dst, dst, vlen_enc);
2803       break;
2804     case T_LONG:
2805       vpmovsxbq(dst, src, vlen_enc);
2806       break;
2807     case T_DOUBLE: {
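           // Sign-extend the bytes to ints at half the destination width first;
           // vcvtdq2pd then doubles the element size (int -> double), so the
           // converted values fill the full destination vector.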
2808       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2809       vpmovsxbd(dst, src, mid_vlen_enc);
2810       vcvtdq2pd(dst, dst, vlen_enc);
2811       break;
2812     }
2813     default:
2814       fatal("Unsupported type %s", type2name(to_elem_bt));
2815       break;
2816   }
2817 }
2818 
2819 //-------------------------------------------------------------------------------------------
2820 
2821 // IndexOf for constant substrings with size >= 8 chars
2822 // which don't need to be loaded through stack.
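     // Roughly equivalent Java-level sketch (illustrative only; the caller has
     // already rejected empty and over-long substrings, and 'matches' is just a
     // stand-in for an element-wise comparison of int_cnt2 elements):
     //   for (int i = 0; i + int_cnt2 <= cnt1; i++) {
     //     if (matches(str1, i, str2, 0, int_cnt2)) return i;
     //   }
     //   return -1;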
2823 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2824                                          Register cnt1, Register cnt2,
2825                                          int int_cnt2,  Register result,
2826                                          XMMRegister vec, Register tmp,
2827                                          int ae) {
2828   ShortBranchVerifier sbv(this);
2829   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2830   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2831 
2832   // This method uses the pcmpestri instruction with bound registers
2833   //   inputs:
2834   //     xmm - substring
2835   //     rax - substring length (elements count)
2836   //     mem - scanned string
2837   //     rdx - string length (elements count)
2838   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2839   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2840   //   outputs:
2841   //     rcx - matched index in string
2842   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2843   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2844   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2845   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2846   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2847 
2848   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2849         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2850         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2851 
2852   // Note, inline_string_indexOf() generates checks:
2853   // if (substr.count > string.count) return -1;
2854   // if (substr.count == 0) return 0;
2855   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2856 
2857   // Load substring.
2858   if (ae == StrIntrinsicNode::UL) {
2859     pmovzxbw(vec, Address(str2, 0));
2860   } else {
2861     movdqu(vec, Address(str2, 0));
2862   }
2863   movl(cnt2, int_cnt2);
2864   movptr(result, str1); // string addr
2865 
2866   if (int_cnt2 > stride) {
2867     jmpb(SCAN_TO_SUBSTR);
2868 
2869     // Reload substr for rescan; this code
2870     // is executed only for large substrings (> 8 chars).
2871     bind(RELOAD_SUBSTR);
2872     if (ae == StrIntrinsicNode::UL) {
2873       pmovzxbw(vec, Address(str2, 0));
2874     } else {
2875       movdqu(vec, Address(str2, 0));
2876     }
2877     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2878 
2879     bind(RELOAD_STR);
2880     // We came here after the beginning of the substring was
2881     // matched but the rest of it was not, so we need to search
2882     // again. Start from the next element after the previous match.
2883 
2884     // cnt2 is the number of remaining substring elements and
2885     // cnt1 is the number of remaining string elements when the compare failed.
2886     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2887     subl(cnt1, cnt2);
2888     addl(cnt1, int_cnt2);
2889     movl(cnt2, int_cnt2); // Now restore cnt2
2890 
2891     decrementl(cnt1);     // Shift to next element
2892     cmpl(cnt1, cnt2);
2893     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2894 
2895     addptr(result, (1<<scale1));
2896 
2897   } // (int_cnt2 > 8)
2898 
2899   // Scan string for start of substr in 16-byte vectors
2900   bind(SCAN_TO_SUBSTR);
2901   pcmpestri(vec, Address(result, 0), mode);
2902   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2903   subl(cnt1, stride);
2904   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2905   cmpl(cnt1, cnt2);
2906   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2907   addptr(result, 16);
2908   jmpb(SCAN_TO_SUBSTR);
2909 
2910   // Found a potential substr
2911   bind(FOUND_CANDIDATE);
2912   // Matched whole vector if first element matched (tmp(rcx) == 0).
2913   if (int_cnt2 == stride) {
2914     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2915   } else { // int_cnt2 > 8
2916     jccb(Assembler::overflow, FOUND_SUBSTR);
2917   }
2918   // After pcmpestri tmp(rcx) contains matched element index
2919   // Compute start addr of substr
2920   lea(result, Address(result, tmp, scale1));
2921 
2922   // Make sure string is still long enough
2923   subl(cnt1, tmp);
2924   cmpl(cnt1, cnt2);
2925   if (int_cnt2 == stride) {
2926     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2927   } else { // int_cnt2 > 8
2928     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2929   }
2930   // Left less than substring.
2931 
2932   bind(RET_NOT_FOUND);
2933   movl(result, -1);
2934   jmp(EXIT);
2935 
2936   if (int_cnt2 > stride) {
2937     // This code is optimized for the case when the whole substring
2938     // is matched once its head is matched.
2939     bind(MATCH_SUBSTR_HEAD);
2940     pcmpestri(vec, Address(result, 0), mode);
2941     // Reload only the string if it does not match
2942     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2943 
2944     Label CONT_SCAN_SUBSTR;
2945     // Compare the rest of substring (> 8 chars).
2946     bind(FOUND_SUBSTR);
2947     // First 8 chars are already matched.
2948     negptr(cnt2);
2949     addptr(cnt2, stride);
2950 
2951     bind(SCAN_SUBSTR);
2952     subl(cnt1, stride);
2953     cmpl(cnt2, -stride); // Do not read beyond substring
2954     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2955     // Back-up strings to avoid reading beyond substring:
2956     // cnt1 = cnt1 - cnt2 + 8
2957     addl(cnt1, cnt2); // cnt2 is negative
2958     addl(cnt1, stride);
2959     movl(cnt2, stride); negptr(cnt2);
2960     bind(CONT_SCAN_SUBSTR);
2961     if (int_cnt2 < (int)G) {
2962       int tail_off1 = int_cnt2<<scale1;
2963       int tail_off2 = int_cnt2<<scale2;
2964       if (ae == StrIntrinsicNode::UL) {
2965         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2966       } else {
2967         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2968       }
2969       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2970     } else {
2971       // calculate index in register to avoid integer overflow (int_cnt2*2)
2972       movl(tmp, int_cnt2);
2973       addptr(tmp, cnt2);
2974       if (ae == StrIntrinsicNode::UL) {
2975         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2976       } else {
2977         movdqu(vec, Address(str2, tmp, scale2, 0));
2978       }
2979       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2980     }
2981     // Need to reload the string pointers if the whole vector did not match
2982     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2983     addptr(cnt2, stride);
2984     jcc(Assembler::negative, SCAN_SUBSTR);
2985     // Fall through if found full substring
2986 
2987   } // (int_cnt2 > 8)
2988 
2989   bind(RET_FOUND);
2990   // Found result if we matched full small substring.
2991   // Compute substr offset
2992   subptr(result, str1);
2993   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2994     shrl(result, 1); // index
2995   }
2996   bind(EXIT);
2997 
2998 } // string_indexofC8
2999 
3000 // Small strings are loaded through stack if they cross page boundary.
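     // Unlike string_indexofC8 above, this version also handles small constant
     // substrings and substrings of non-constant length; short strings whose
     // 16-byte read could cross a page boundary are first copied onto the stack
     // so that pcmpestri can safely read a whole vector.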
3001 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3002                                        Register cnt1, Register cnt2,
3003                                        int int_cnt2,  Register result,
3004                                        XMMRegister vec, Register tmp,
3005                                        int ae) {
3006   ShortBranchVerifier sbv(this);
3007   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3008   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3009 
3010   //
3011   // int_cnt2 is the length of a small (< 8 chars) constant substring
3012   // or (-1) for a non-constant substring, in which case its length
3013   // is in the cnt2 register.
3014   //
3015   // Note, inline_string_indexOf() generates checks:
3016   // if (substr.count > string.count) return -1;
3017   // if (substr.count == 0) return 0;
3018   //
3019   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3020   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3021   // This method uses the pcmpestri instruction with bound registers
3022   //   inputs:
3023   //     xmm - substring
3024   //     rax - substring length (elements count)
3025   //     mem - scanned string
3026   //     rdx - string length (elements count)
3027   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3028   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3029   //   outputs:
3030   //     rcx - matched index in string
3031   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3032   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3033   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3034   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3035 
3036   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3037         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3038         FOUND_CANDIDATE;
3039 
3040   { //========================================================
3041     // We don't know where these strings are located
3042     // and we can't read beyond them. Load them through stack.
3043     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3044 
3045     movptr(tmp, rsp); // save old SP
3046 
3047     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3048       if (int_cnt2 == (1>>scale2)) { // One byte
3049         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3050         load_unsigned_byte(result, Address(str2, 0));
3051         movdl(vec, result); // move 32 bits
3052       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3053         // Not enough header space in 32-bit VM: 12+3 = 15.
3054         movl(result, Address(str2, -1));
3055         shrl(result, 8);
3056         movdl(vec, result); // move 32 bits
3057       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3058         load_unsigned_short(result, Address(str2, 0));
3059         movdl(vec, result); // move 32 bits
3060       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3061         movdl(vec, Address(str2, 0)); // move 32 bits
3062       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3063         movq(vec, Address(str2, 0));  // move 64 bits
3064       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3065         // Array header size is 12 bytes in 32-bit VM
3066         // + 6 bytes for 3 chars == 18 bytes,
3067         // enough space to load vec and shift.
3068         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3069         if (ae == StrIntrinsicNode::UL) {
3070           int tail_off = int_cnt2-8;
3071           pmovzxbw(vec, Address(str2, tail_off));
3072           psrldq(vec, -2*tail_off);
3073         }
3074         else {
3075           int tail_off = int_cnt2*(1<<scale2);
3076           movdqu(vec, Address(str2, tail_off-16));
3077           psrldq(vec, 16-tail_off);
3078         }
3079       }
3080     } else { // not constant substring
3081       cmpl(cnt2, stride);
3082       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3083 
3084       // We can read beyond the string if str+16 does not cross a page boundary
3085       // since heaps are aligned and mapped by pages.
3086       assert(os::vm_page_size() < (int)G, "default page should be small");
3087       movl(result, str2); // We need only low 32 bits
3088       andl(result, ((int)os::vm_page_size()-1));
3089       cmpl(result, ((int)os::vm_page_size()-16));
3090       jccb(Assembler::belowEqual, CHECK_STR);
3091 
3092       // Move small strings to the stack to allow loading 16 bytes into vec.
3093       subptr(rsp, 16);
3094       int stk_offset = wordSize-(1<<scale2);
3095       push(cnt2);
3096 
3097       bind(COPY_SUBSTR);
3098       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3099         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3100         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3101       } else if (ae == StrIntrinsicNode::UU) {
3102         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3103         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3104       }
3105       decrement(cnt2);
3106       jccb(Assembler::notZero, COPY_SUBSTR);
3107 
3108       pop(cnt2);
3109       movptr(str2, rsp);  // New substring address
3110     } // non constant
3111 
3112     bind(CHECK_STR);
3113     cmpl(cnt1, stride);
3114     jccb(Assembler::aboveEqual, BIG_STRINGS);
3115 
3116     // Check cross page boundary.
3117     movl(result, str1); // We need only low 32 bits
3118     andl(result, ((int)os::vm_page_size()-1));
3119     cmpl(result, ((int)os::vm_page_size()-16));
3120     jccb(Assembler::belowEqual, BIG_STRINGS);
3121 
3122     subptr(rsp, 16);
3123     int stk_offset = -(1<<scale1);
3124     if (int_cnt2 < 0) { // not constant
3125       push(cnt2);
3126       stk_offset += wordSize;
3127     }
3128     movl(cnt2, cnt1);
3129 
3130     bind(COPY_STR);
3131     if (ae == StrIntrinsicNode::LL) {
3132       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3133       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3134     } else {
3135       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3136       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3137     }
3138     decrement(cnt2);
3139     jccb(Assembler::notZero, COPY_STR);
3140 
3141     if (int_cnt2 < 0) { // not constant
3142       pop(cnt2);
3143     }
3144     movptr(str1, rsp);  // New string address
3145 
3146     bind(BIG_STRINGS);
3147     // Load substring.
3148     if (int_cnt2 < 0) { // -1
3149       if (ae == StrIntrinsicNode::UL) {
3150         pmovzxbw(vec, Address(str2, 0));
3151       } else {
3152         movdqu(vec, Address(str2, 0));
3153       }
3154       push(cnt2);       // substr count
3155       push(str2);       // substr addr
3156       push(str1);       // string addr
3157     } else {
3158       // Small (< 8 chars) constant substrings are loaded already.
3159       movl(cnt2, int_cnt2);
3160     }
3161     push(tmp);  // original SP
3162 
3163   } // Finished loading
3164 
3165   //========================================================
3166   // Start search
3167   //
3168 
3169   movptr(result, str1); // string addr
3170 
3171   if (int_cnt2  < 0) {  // Only for non constant substring
3172     jmpb(SCAN_TO_SUBSTR);
3173 
3174     // SP saved at sp+0
3175     // String saved at sp+1*wordSize
3176     // Substr saved at sp+2*wordSize
3177     // Substr count saved at sp+3*wordSize
3178 
3179     // Reload substr for rescan; this code
3180     // is executed only for large substrings (> 8 chars).
3181     bind(RELOAD_SUBSTR);
3182     movptr(str2, Address(rsp, 2*wordSize));
3183     movl(cnt2, Address(rsp, 3*wordSize));
3184     if (ae == StrIntrinsicNode::UL) {
3185       pmovzxbw(vec, Address(str2, 0));
3186     } else {
3187       movdqu(vec, Address(str2, 0));
3188     }
3189     // We came here after the beginning of the substring was
3190     // matched but the rest of it was not, so we need to search
3191     // again. Start from the next element after the previous match.
3192     subptr(str1, result); // Restore counter
3193     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3194       shrl(str1, 1);
3195     }
3196     addl(cnt1, str1);
3197     decrementl(cnt1);   // Shift to next element
3198     cmpl(cnt1, cnt2);
3199     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3200 
3201     addptr(result, (1<<scale1));
3202   } // non constant
3203 
3204   // Scan string for start of substr in 16-byte vectors
3205   bind(SCAN_TO_SUBSTR);
3206   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3207   pcmpestri(vec, Address(result, 0), mode);
3208   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3209   subl(cnt1, stride);
3210   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3211   cmpl(cnt1, cnt2);
3212   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3213   addptr(result, 16);
3214 
3215   bind(ADJUST_STR);
3216   cmpl(cnt1, stride); // Do not read beyond string
3217   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3218   // Back-up string to avoid reading beyond string.
3219   lea(result, Address(result, cnt1, scale1, -16));
3220   movl(cnt1, stride);
3221   jmpb(SCAN_TO_SUBSTR);
3222 
3223   // Found a potential substr
3224   bind(FOUND_CANDIDATE);
3225   // After pcmpestri tmp(rcx) contains matched element index
3226 
3227   // Make sure string is still long enough
3228   subl(cnt1, tmp);
3229   cmpl(cnt1, cnt2);
3230   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3231   // Left less than substring.
3232 
3233   bind(RET_NOT_FOUND);
3234   movl(result, -1);
3235   jmp(CLEANUP);
3236 
3237   bind(FOUND_SUBSTR);
3238   // Compute start addr of substr
3239   lea(result, Address(result, tmp, scale1));
3240   if (int_cnt2 > 0) { // Constant substring
3241     // Repeat search for small substring (< 8 chars)
3242     // from new point without reloading substring.
3243     // Have to check that we don't read beyond string.
3244     cmpl(tmp, stride-int_cnt2);
3245     jccb(Assembler::greater, ADJUST_STR);
3246     // Fall through if matched whole substring.
3247   } else { // non constant
3248     assert(int_cnt2 == -1, "should be != 0");
3249 
3250     addl(tmp, cnt2);
3251     // Found result if we matched whole substring.
3252     cmpl(tmp, stride);
3253     jcc(Assembler::lessEqual, RET_FOUND);
3254 
3255     // Repeat search for small substring (<= 8 chars)
3256     // from new point 'str1' without reloading substring.
3257     cmpl(cnt2, stride);
3258     // Have to check that we don't read beyond string.
3259     jccb(Assembler::lessEqual, ADJUST_STR);
3260 
3261     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3262     // Compare the rest of substring (> 8 chars).
3263     movptr(str1, result);
3264 
3265     cmpl(tmp, cnt2);
3266     // First 8 chars are already matched.
3267     jccb(Assembler::equal, CHECK_NEXT);
3268 
3269     bind(SCAN_SUBSTR);
3270     pcmpestri(vec, Address(str1, 0), mode);
3271     // Need to reload the string pointers if the whole vector did not match
3272     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3273 
3274     bind(CHECK_NEXT);
3275     subl(cnt2, stride);
3276     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3277     addptr(str1, 16);
3278     if (ae == StrIntrinsicNode::UL) {
3279       addptr(str2, 8);
3280     } else {
3281       addptr(str2, 16);
3282     }
3283     subl(cnt1, stride);
3284     cmpl(cnt2, stride); // Do not read beyond substring
3285     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3286     // Back-up strings to avoid reading beyond substring.
3287 
3288     if (ae == StrIntrinsicNode::UL) {
3289       lea(str2, Address(str2, cnt2, scale2, -8));
3290       lea(str1, Address(str1, cnt2, scale1, -16));
3291     } else {
3292       lea(str2, Address(str2, cnt2, scale2, -16));
3293       lea(str1, Address(str1, cnt2, scale1, -16));
3294     }
3295     subl(cnt1, cnt2);
3296     movl(cnt2, stride);
3297     addl(cnt1, stride);
3298     bind(CONT_SCAN_SUBSTR);
3299     if (ae == StrIntrinsicNode::UL) {
3300       pmovzxbw(vec, Address(str2, 0));
3301     } else {
3302       movdqu(vec, Address(str2, 0));
3303     }
3304     jmp(SCAN_SUBSTR);
3305 
3306     bind(RET_FOUND_LONG);
3307     movptr(str1, Address(rsp, wordSize));
3308   } // non constant
3309 
3310   bind(RET_FOUND);
3311   // Compute substr offset
3312   subptr(result, str1);
3313   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3314     shrl(result, 1); // index
3315   }
3316   bind(CLEANUP);
3317   pop(rsp); // restore SP
3318 
3319 } // string_indexof
3320 
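     // Finds the first occurrence of the UTF-16 char 'ch' in str1[0..cnt1) and
     // returns its index, or -1 if it is absent. Illustrative Java-level sketch of
     // what the vectorized scan below computes:
     //   for (int i = 0; i < cnt1; i++) {
     //     if (str1[i] == ch) return i;
     //   }
     //   return -1;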
3321 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3322                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3323   ShortBranchVerifier sbv(this);
3324   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3325 
3326   int stride = 8;
3327 
3328   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3329         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3330         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3331         FOUND_SEQ_CHAR, DONE_LABEL;
3332 
3333   movptr(result, str1);
3334   if (UseAVX >= 2) {
3335     cmpl(cnt1, stride);
3336     jcc(Assembler::less, SCAN_TO_CHAR);
3337     cmpl(cnt1, 2*stride);
3338     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3339     movdl(vec1, ch);
3340     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3341     vpxor(vec2, vec2);
3342     movl(tmp, cnt1);
3343     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3344     andl(cnt1,0x0000000F);  //tail count (in chars)
3345 
3346     bind(SCAN_TO_16_CHAR_LOOP);
3347     vmovdqu(vec3, Address(result, 0));
3348     vpcmpeqw(vec3, vec3, vec1, 1);
3349     vptest(vec2, vec3);
3350     jcc(Assembler::carryClear, FOUND_CHAR);
3351     addptr(result, 32);
3352     subl(tmp, 2*stride);
3353     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3354     jmp(SCAN_TO_8_CHAR);
3355     bind(SCAN_TO_8_CHAR_INIT);
3356     movdl(vec1, ch);
3357     pshuflw(vec1, vec1, 0x00);
3358     pshufd(vec1, vec1, 0);
3359     pxor(vec2, vec2);
3360   }
3361   bind(SCAN_TO_8_CHAR);
3362   cmpl(cnt1, stride);
3363   jcc(Assembler::less, SCAN_TO_CHAR);
3364   if (UseAVX < 2) {
3365     movdl(vec1, ch);
3366     pshuflw(vec1, vec1, 0x00);
3367     pshufd(vec1, vec1, 0);
3368     pxor(vec2, vec2);
3369   }
3370   movl(tmp, cnt1);
3371   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3372   andl(cnt1,0x00000007);  //tail count (in chars)
3373 
3374   bind(SCAN_TO_8_CHAR_LOOP);
3375   movdqu(vec3, Address(result, 0));
3376   pcmpeqw(vec3, vec1);
3377   ptest(vec2, vec3);
3378   jcc(Assembler::carryClear, FOUND_CHAR);
3379   addptr(result, 16);
3380   subl(tmp, stride);
3381   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3382   bind(SCAN_TO_CHAR);
3383   testl(cnt1, cnt1);
3384   jcc(Assembler::zero, RET_NOT_FOUND);
3385   bind(SCAN_TO_CHAR_LOOP);
3386   load_unsigned_short(tmp, Address(result, 0));
3387   cmpl(ch, tmp);
3388   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3389   addptr(result, 2);
3390   subl(cnt1, 1);
3391   jccb(Assembler::zero, RET_NOT_FOUND);
3392   jmp(SCAN_TO_CHAR_LOOP);
3393 
3394   bind(RET_NOT_FOUND);
3395   movl(result, -1);
3396   jmpb(DONE_LABEL);
3397 
3398   bind(FOUND_CHAR);
3399   if (UseAVX >= 2) {
3400     vpmovmskb(tmp, vec3);
3401   } else {
3402     pmovmskb(tmp, vec3);
3403   }
3404   bsfl(ch, tmp);
3405   addptr(result, ch);
3406 
3407   bind(FOUND_SEQ_CHAR);
3408   subptr(result, str1);
3409   shrl(result, 1);
3410 
3411   bind(DONE_LABEL);
3412 } // string_indexof_char
3413 
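     // Latin-1 (byte[]) variant of the scan above: it compares single bytes,
     // processes 16 or 32 bytes per vector iteration, and returns a byte index
     // (no final shift by 1).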
3414 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3415                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3416   ShortBranchVerifier sbv(this);
3417   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3418 
3419   int stride = 16;
3420 
3421   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3422         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3423         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3424         FOUND_SEQ_CHAR, DONE_LABEL;
3425 
3426   movptr(result, str1);
3427   if (UseAVX >= 2) {
3428     cmpl(cnt1, stride);
3429     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3430     cmpl(cnt1, stride*2);
3431     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3432     movdl(vec1, ch);
3433     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3434     vpxor(vec2, vec2);
3435     movl(tmp, cnt1);
3436     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3437     andl(cnt1,0x0000001F);  //tail count (in chars)
3438 
3439     bind(SCAN_TO_32_CHAR_LOOP);
3440     vmovdqu(vec3, Address(result, 0));
3441     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3442     vptest(vec2, vec3);
3443     jcc(Assembler::carryClear, FOUND_CHAR);
3444     addptr(result, 32);
3445     subl(tmp, stride*2);
3446     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3447     jmp(SCAN_TO_16_CHAR);
3448 
3449     bind(SCAN_TO_16_CHAR_INIT);
3450     movdl(vec1, ch);
3451     pxor(vec2, vec2);
3452     pshufb(vec1, vec2);
3453   }
3454 
3455   bind(SCAN_TO_16_CHAR);
3456   cmpl(cnt1, stride);
3457   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3458   if (UseAVX < 2) {
3459     movdl(vec1, ch);
3460     pxor(vec2, vec2);
3461     pshufb(vec1, vec2);
3462   }
3463   movl(tmp, cnt1);
3464   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3465   andl(cnt1,0x0000000F);  //tail count (in bytes)
3466 
3467   bind(SCAN_TO_16_CHAR_LOOP);
3468   movdqu(vec3, Address(result, 0));
3469   pcmpeqb(vec3, vec1);
3470   ptest(vec2, vec3);
3471   jcc(Assembler::carryClear, FOUND_CHAR);
3472   addptr(result, 16);
3473   subl(tmp, stride);
3474   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3475 
3476   bind(SCAN_TO_CHAR_INIT);
3477   testl(cnt1, cnt1);
3478   jcc(Assembler::zero, RET_NOT_FOUND);
3479   bind(SCAN_TO_CHAR_LOOP);
3480   load_unsigned_byte(tmp, Address(result, 0));
3481   cmpl(ch, tmp);
3482   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3483   addptr(result, 1);
3484   subl(cnt1, 1);
3485   jccb(Assembler::zero, RET_NOT_FOUND);
3486   jmp(SCAN_TO_CHAR_LOOP);
3487 
3488   bind(RET_NOT_FOUND);
3489   movl(result, -1);
3490   jmpb(DONE_LABEL);
3491 
3492   bind(FOUND_CHAR);
3493   if (UseAVX >= 2) {
3494     vpmovmskb(tmp, vec3);
3495   } else {
3496     pmovmskb(tmp, vec3);
3497   }
3498   bsfl(ch, tmp);
3499   addptr(result, ch);
3500 
3501   bind(FOUND_SEQ_CHAR);
3502   subptr(result, str1);
3503 
3504   bind(DONE_LABEL);
3505 } // stringL_indexof_char
3506 
3507 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3508   switch (eltype) {
3509   case T_BOOLEAN: return sizeof(jboolean);
3510   case T_BYTE:  return sizeof(jbyte);
3511   case T_SHORT: return sizeof(jshort);
3512   case T_CHAR:  return sizeof(jchar);
3513   case T_INT:   return sizeof(jint);
3514   default:
3515     ShouldNotReachHere();
3516     return -1;
3517   }
3518 }
3519 
3520 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3521   switch (eltype) {
3522   // T_BOOLEAN used as surrogate for unsigned byte
3523   case T_BOOLEAN: movzbl(dst, src);   break;
3524   case T_BYTE:    movsbl(dst, src);   break;
3525   case T_SHORT:   movswl(dst, src);   break;
3526   case T_CHAR:    movzwl(dst, src);   break;
3527   case T_INT:     movl(dst, src);     break;
3528   default:
3529     ShouldNotReachHere();
3530   }
3531 }
3532 
3533 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3534   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3535 }
3536 
3537 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3538   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3539 }
3540 
3541 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3542   const int vlen = Assembler::AVX_256bit;
3543   switch (eltype) {
3544   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3545   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3546   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3547   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3548   case T_INT:
3549     // do nothing
3550     break;
3551   default:
3552     ShouldNotReachHere();
3553   }
3554 }
3555 
3556 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3557                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3558                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3559                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3560                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3561                                         BasicType eltype) {
3562   ShortBranchVerifier sbv(this);
3563   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3564   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3565   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3566 
3567   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3568         SHORT_UNROLLED_LOOP_EXIT,
3569         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3570         UNROLLED_VECTOR_LOOP_BEGIN,
3571         END;
3572   switch (eltype) {
3573   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3574   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3575   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3576   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3577   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3578   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3579   }
3580 
3581   // Group the registers into arrays ("renaming") for readability of the code
3582   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3583                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3584                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3585 
3586   const int elsize = arrays_hashcode_elsize(eltype);
3587 
3588   /*
3589     if (cnt1 >= 2) {
3590       if (cnt1 >= 32) {
3591         UNROLLED VECTOR LOOP
3592       }
3593       UNROLLED SCALAR LOOP
3594     }
3595     SINGLE SCALAR
3596    */
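       // Scalar reference for the whole computation (illustrative): starting from
       // the incoming 'result', apply result = 31 * result + a[i] for each
       // element. The unrolled paths below take the same recurrence in larger
       // steps, e.g. result *= 31*31 per pair of elements in the short loop, and
       // by a corresponding power of 31 per 32-element block in the vector loop.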
3597 
3598   cmpl(cnt1, 32);
3599   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3600 
3601   // cnt1 >= 32 && generate_vectorized_loop
3602   xorl(index, index);
3603 
3604   // vresult = IntVector.zero(I256);
3605   for (int idx = 0; idx < 4; idx++) {
3606     vpxor(vresult[idx], vresult[idx]);
3607   }
3608   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3609   Register bound = tmp2;
3610   Register next = tmp3;
3611   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3612   movl(next, Address(tmp2, 0));
3613   movdl(vnext, next);
3614   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3615 
3616   // index = 0;
3617   // bound = cnt1 & ~(32 - 1);
3618   movl(bound, cnt1);
3619   andl(bound, ~(32 - 1));
3620   // for (; index < bound; index += 32) {
3621   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3622   // result *= next;
3623   imull(result, next);
3624   // Loop fission to front-load the cost of fetching from memory; OOO execution
3625   // can then hopefully do a better job of prefetching.
3626   for (int idx = 0; idx < 4; idx++) {
3627     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3628   }
3629   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3630   for (int idx = 0; idx < 4; idx++) {
3631     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3632     arrays_hashcode_elvcast(vtmp[idx], eltype);
3633     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3634   }
3635   // index += 32;
3636   addl(index, 32);
3637   // index < bound;
3638   cmpl(index, bound);
3639   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3640   // }
3641 
3642   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3643   subl(cnt1, bound);
3644   // release bound
3645 
3646   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3647   for (int idx = 0; idx < 4; idx++) {
3648     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3649     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3650     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3651   }
3652   // result += vresult.reduceLanes(ADD);
3653   for (int idx = 0; idx < 4; idx++) {
3654     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3655   }
3656 
3657   // } else if (cnt1 < 32) {
3658 
3659   bind(SHORT_UNROLLED_BEGIN);
3660   // int i = 1;
3661   movl(index, 1);
3662   cmpl(index, cnt1);
3663   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3664 
3665   // for (; i < cnt1 ; i += 2) {
3666   bind(SHORT_UNROLLED_LOOP_BEGIN);
3667   movl(tmp3, 961);
3668   imull(result, tmp3);
3669   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3670   movl(tmp3, tmp2);
3671   shll(tmp3, 5);
3672   subl(tmp3, tmp2);
3673   addl(result, tmp3);
3674   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3675   addl(result, tmp3);
3676   addl(index, 2);
3677   cmpl(index, cnt1);
3678   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3679 
3680   // }
3681   // if (i >= cnt1) {
3682   bind(SHORT_UNROLLED_LOOP_EXIT);
3683   jccb(Assembler::greater, END);
3684   movl(tmp2, result);
3685   shll(result, 5);
3686   subl(result, tmp2);
3687   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3688   addl(result, tmp3);
3689   // }
3690   bind(END);
3691 
3692   BLOCK_COMMENT("} // arrays_hashcode");
3693 
3694 } // arrays_hashcode
3695 
3696 // helper function for string_compare
3697 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3698                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3699                                            Address::ScaleFactor scale2, Register index, int ae) {
3700   if (ae == StrIntrinsicNode::LL) {
3701     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3702     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3703   } else if (ae == StrIntrinsicNode::UU) {
3704     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3705     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3706   } else {
3707     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3708     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3709   }
3710 }
3711 
3712 // Compare strings, used for char[] and byte[].
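     // Illustrative Java-level sketch of the result contract (not the exact
     // library code):
     //   int min = Math.min(len1, len2);
     //   for (int i = 0; i < min; i++) {
     //     if (str1[i] != str2[i]) return str1[i] - str2[i];
     //   }
     //   return len1 - len2;
     // The UL case additionally negates the result at the very end of the method.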
3713 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3714                                        Register cnt1, Register cnt2, Register result,
3715                                        XMMRegister vec1, int ae, KRegister mask) {
3716   ShortBranchVerifier sbv(this);
3717   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3718   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3719   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3720   int stride2x2 = 0x40;
3721   Address::ScaleFactor scale = Address::no_scale;
3722   Address::ScaleFactor scale1 = Address::no_scale;
3723   Address::ScaleFactor scale2 = Address::no_scale;
3724 
3725   if (ae != StrIntrinsicNode::LL) {
3726     stride2x2 = 0x20;
3727   }
3728 
3729   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3730     shrl(cnt2, 1);
3731   }
3732   // Compute the minimum of the string lengths and the
3733   // difference of the string lengths (kept on the stack).
3734   // The minimum is selected with a conditional move.
3735   movl(result, cnt1);
3736   subl(cnt1, cnt2);
3737   push(cnt1);
3738   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3739 
3740   // Is the minimum length zero?
3741   testl(cnt2, cnt2);
3742   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3743   if (ae == StrIntrinsicNode::LL) {
3744     // Load first bytes
3745     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3746     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3747   } else if (ae == StrIntrinsicNode::UU) {
3748     // Load first characters
3749     load_unsigned_short(result, Address(str1, 0));
3750     load_unsigned_short(cnt1, Address(str2, 0));
3751   } else {
3752     load_unsigned_byte(result, Address(str1, 0));
3753     load_unsigned_short(cnt1, Address(str2, 0));
3754   }
3755   subl(result, cnt1);
3756   jcc(Assembler::notZero,  POP_LABEL);
3757 
3758   if (ae == StrIntrinsicNode::UU) {
3759     // Divide length by 2 to get number of chars
3760     shrl(cnt2, 1);
3761   }
3762   cmpl(cnt2, 1);
3763   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3764 
3765   // Check if the strings start at the same location and setup scale and stride
3766   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3767     cmpptr(str1, str2);
3768     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3769     if (ae == StrIntrinsicNode::LL) {
3770       scale = Address::times_1;
3771       stride = 16;
3772     } else {
3773       scale = Address::times_2;
3774       stride = 8;
3775     }
3776   } else {
3777     scale1 = Address::times_1;
3778     scale2 = Address::times_2;
3779     // scale not used
3780     stride = 8;
3781   }
3782 
3783   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3784     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3785     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3786     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3787     Label COMPARE_TAIL_LONG;
3788     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3789 
3790     int pcmpmask = 0x19;
3791     if (ae == StrIntrinsicNode::LL) {
3792       pcmpmask &= ~0x01;
3793     }
3794 
3795     // Set up to compare 16-char (32-byte) vectors;
3796     // start from the first character again because it has an aligned address.
3797     if (ae == StrIntrinsicNode::LL) {
3798       stride2 = 32;
3799     } else {
3800       stride2 = 16;
3801     }
3802     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3803       adr_stride = stride << scale;
3804     } else {
3805       adr_stride1 = 8;  //stride << scale1;
3806       adr_stride2 = 16; //stride << scale2;
3807     }
3808 
3809     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3810     // rax and rdx are used by pcmpestri as elements counters
3811     movl(result, cnt2);
3812     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3813     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3814 
3815     // fast path : compare first 2 8-char vectors.
3816     bind(COMPARE_16_CHARS);
3817     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3818       movdqu(vec1, Address(str1, 0));
3819     } else {
3820       pmovzxbw(vec1, Address(str1, 0));
3821     }
3822     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3823     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3824 
3825     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3826       movdqu(vec1, Address(str1, adr_stride));
3827       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3828     } else {
3829       pmovzxbw(vec1, Address(str1, adr_stride1));
3830       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3831     }
3832     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3833     addl(cnt1, stride);
3834 
3835     // Compare the characters at index in cnt1
3836     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3837     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3838     subl(result, cnt2);
3839     jmp(POP_LABEL);
3840 
3841     // Setup the registers to start vector comparison loop
3842     bind(COMPARE_WIDE_VECTORS);
3843     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3844       lea(str1, Address(str1, result, scale));
3845       lea(str2, Address(str2, result, scale));
3846     } else {
3847       lea(str1, Address(str1, result, scale1));
3848       lea(str2, Address(str2, result, scale2));
3849     }
3850     subl(result, stride2);
3851     subl(cnt2, stride2);
3852     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3853     negptr(result);
3854 
3855     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3856     bind(COMPARE_WIDE_VECTORS_LOOP);
3857 
3858 #ifdef _LP64
3859     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3860       cmpl(cnt2, stride2x2);
3861       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3862       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3863       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3864 
3865       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3866       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3867         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3868         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == all 1s if the operands are equal, otherwise it has some 0 bits
3869       } else {
3870         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3871         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == all 1s if the operands are equal, otherwise it has some 0 bits
3872       }
3873       kortestql(mask, mask);
3874       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3875       addptr(result, stride2x2);  // update since we already compared at this addr
3876       subl(cnt2, stride2x2);      // and sub the size too
3877       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3878 
3879       vpxor(vec1, vec1);
3880       jmpb(COMPARE_WIDE_TAIL);
3881     }//if (VM_Version::supports_avx512vlbw())
3882 #endif // _LP64
3883 
3884 
3885     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3886     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3887       vmovdqu(vec1, Address(str1, result, scale));
3888       vpxor(vec1, Address(str2, result, scale));
3889     } else {
3890       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3891       vpxor(vec1, Address(str2, result, scale2));
3892     }
3893     vptest(vec1, vec1);
3894     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3895     addptr(result, stride2);
3896     subl(cnt2, stride2);
3897     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3898     // clean upper bits of YMM registers
3899     vpxor(vec1, vec1);
3900 
3901     // compare wide vectors tail
3902     bind(COMPARE_WIDE_TAIL);
3903     testptr(result, result);
3904     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3905 
3906     movl(result, stride2);
3907     movl(cnt2, result);
3908     negptr(result);
3909     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3910 
3911     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3912     bind(VECTOR_NOT_EQUAL);
3913     // clean upper bits of YMM registers
3914     vpxor(vec1, vec1);
3915     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3916       lea(str1, Address(str1, result, scale));
3917       lea(str2, Address(str2, result, scale));
3918     } else {
3919       lea(str1, Address(str1, result, scale1));
3920       lea(str2, Address(str2, result, scale2));
3921     }
3922     jmp(COMPARE_16_CHARS);
3923 
3924     // Compare tail chars, length between 1 and 15 chars
3925     bind(COMPARE_TAIL_LONG);
3926     movl(cnt2, result);
3927     cmpl(cnt2, stride);
3928     jcc(Assembler::less, COMPARE_SMALL_STR);
3929 
3930     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3931       movdqu(vec1, Address(str1, 0));
3932     } else {
3933       pmovzxbw(vec1, Address(str1, 0));
3934     }
3935     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3936     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3937     subptr(cnt2, stride);
3938     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3939     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3940       lea(str1, Address(str1, result, scale));
3941       lea(str2, Address(str2, result, scale));
3942     } else {
3943       lea(str1, Address(str1, result, scale1));
3944       lea(str2, Address(str2, result, scale2));
3945     }
3946     negptr(cnt2);
3947     jmpb(WHILE_HEAD_LABEL);
3948 
3949     bind(COMPARE_SMALL_STR);
3950   } else if (UseSSE42Intrinsics) {
3951     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3952     int pcmpmask = 0x19;
3953     // Set up to compare 8-char (16-byte) vectors;
3954     // start from the first character again because it has an aligned address.
3955     movl(result, cnt2);
3956     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3957     if (ae == StrIntrinsicNode::LL) {
3958       pcmpmask &= ~0x01;
3959     }
3960     jcc(Assembler::zero, COMPARE_TAIL);
3961     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3962       lea(str1, Address(str1, result, scale));
3963       lea(str2, Address(str2, result, scale));
3964     } else {
3965       lea(str1, Address(str1, result, scale1));
3966       lea(str2, Address(str2, result, scale2));
3967     }
3968     negptr(result);
3969 
3970     // pcmpestri
3971     //   inputs:
3972     //     vec1- substring
3973     //     rax - negative string length (elements count)
3974     //     mem - scanned string
3975     //     rdx - string length (elements count)
3976     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3977     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3978     //   outputs:
3979     //     rcx - first mismatched element index
3980     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3981 
3982     bind(COMPARE_WIDE_VECTORS);
3983     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3984       movdqu(vec1, Address(str1, result, scale));
3985       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3986     } else {
3987       pmovzxbw(vec1, Address(str1, result, scale1));
3988       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3989     }
3990     // After pcmpestri cnt1(rcx) contains mismatched element index
3991 
3992     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3993     addptr(result, stride);
3994     subptr(cnt2, stride);
3995     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3996 
3997     // compare wide vectors tail
3998     testptr(result, result);
3999     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4000 
4001     movl(cnt2, stride);
4002     movl(result, stride);
4003     negptr(result);
4004     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4005       movdqu(vec1, Address(str1, result, scale));
4006       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4007     } else {
4008       pmovzxbw(vec1, Address(str1, result, scale1));
4009       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4010     }
4011     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4012 
4013     // Mismatched characters in the vectors
4014     bind(VECTOR_NOT_EQUAL);
4015     addptr(cnt1, result);
4016     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4017     subl(result, cnt2);
4018     jmpb(POP_LABEL);
4019 
4020     bind(COMPARE_TAIL); // limit is zero
4021     movl(cnt2, result);
4022     // Fallthru to tail compare
4023   }
4024   // Shift str2 and str1 to the end of the arrays, negate min
4025   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4026     lea(str1, Address(str1, cnt2, scale));
4027     lea(str2, Address(str2, cnt2, scale));
4028   } else {
4029     lea(str1, Address(str1, cnt2, scale1));
4030     lea(str2, Address(str2, cnt2, scale2));
4031   }
4032   decrementl(cnt2);  // first character was compared already
4033   negptr(cnt2);
4034 
4035   // Compare the rest of the elements
4036   bind(WHILE_HEAD_LABEL);
4037   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4038   subl(result, cnt1);
4039   jccb(Assembler::notZero, POP_LABEL);
4040   increment(cnt2);
4041   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4042 
4043   // Strings are equal up to min length.  Return the length difference.
4044   bind(LENGTH_DIFF_LABEL);
4045   pop(result);
4046   if (ae == StrIntrinsicNode::UU) {
4047     // Divide diff by 2 to get number of chars
4048     sarl(result, 1);
4049   }
4050   jmpb(DONE_LABEL);
4051 
4052 #ifdef _LP64
4053   if (VM_Version::supports_avx512vlbw()) {
4054 
4055     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4056 
4057     kmovql(cnt1, mask);
4058     notq(cnt1);
4059     bsfq(cnt2, cnt1);
4060     if (ae != StrIntrinsicNode::LL) {
4061       // Divide diff by 2 to get number of chars
4062       sarl(cnt2, 1);
4063     }
4064     addq(result, cnt2);
4065     if (ae == StrIntrinsicNode::LL) {
4066       load_unsigned_byte(cnt1, Address(str2, result));
4067       load_unsigned_byte(result, Address(str1, result));
4068     } else if (ae == StrIntrinsicNode::UU) {
4069       load_unsigned_short(cnt1, Address(str2, result, scale));
4070       load_unsigned_short(result, Address(str1, result, scale));
4071     } else {
4072       load_unsigned_short(cnt1, Address(str2, result, scale2));
4073       load_unsigned_byte(result, Address(str1, result, scale1));
4074     }
4075     subl(result, cnt1);
4076     jmpb(POP_LABEL);
4077   }//if (VM_Version::supports_avx512vlbw())
4078 #endif // _LP64
4079 
4080   // Discard the stored length difference
4081   bind(POP_LABEL);
4082   pop(cnt1);
4083 
4084   // That's it
4085   bind(DONE_LABEL);
4086   if(ae == StrIntrinsicNode::UL) {
4087     negl(result);
4088   }
4089 
4090 }
4091 
4092 // Search for Non-ASCII character (Negative byte value) in a byte array,
4093 // return the index of the first such character, otherwise the length
4094 // of the array segment searched.
4095 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4096 //   @IntrinsicCandidate
4097 //   public static int countPositives(byte[] ba, int off, int len) {
4098 //     for (int i = off; i < off + len; i++) {
4099 //       if (ba[i] < 0) {
4100 //         return i - off;
4101 //       }
4102 //     }
4103 //     return len;
4104 //   }
4105 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4106   Register result, Register tmp1,
4107   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4108   // rsi: byte array
4109   // rcx: len
4110   // rax: result
4111   ShortBranchVerifier sbv(this);
4112   assert_different_registers(ary1, len, result, tmp1);
4113   assert_different_registers(vec1, vec2);
4114   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4115 
4116   movl(result, len); // copy
4117   // len == 0
4118   testl(len, len);
4119   jcc(Assembler::zero, DONE);
4120 
4121   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4122     VM_Version::supports_avx512vlbw() &&
4123     VM_Version::supports_bmi2()) {
4124 
4125     Label test_64_loop, test_tail, BREAK_LOOP;
4126     movl(tmp1, len);
4127     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4128 
4129     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4130     andl(len,  0xffffffc0); // vector count (in chars)
4131     jccb(Assembler::zero, test_tail);
4132 
4133     lea(ary1, Address(ary1, len, Address::times_1));
4134     negptr(len);
4135 
4136     bind(test_64_loop);
4137     // Check whether our 64 elements of size byte contain negatives
4138     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4139     kortestql(mask1, mask1);
4140     jcc(Assembler::notZero, BREAK_LOOP);
4141 
4142     addptr(len, 64);
4143     jccb(Assembler::notZero, test_64_loop);
4144 
4145     bind(test_tail);
4146     // bail out when there is nothing to be done
4147     testl(tmp1, -1);
4148     jcc(Assembler::zero, DONE);
4149 
4150 
4151     // check the tail for absence of negatives
4152     // ~(~0 << len) applied up to two times (for 32-bit scenario)
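         // e.g. tmp1 == 5 yields ~(~0 << 5) == 0b11111, a k-mask selecting the 5 tail bytes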
4153 #ifdef _LP64
4154     {
4155       Register tmp3_aliased = len;
4156       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4157       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4158       notq(tmp3_aliased);
4159       kmovql(mask2, tmp3_aliased);
4160     }
4161 #else
4162     Label k_init;
4163     jmp(k_init);
4164 
4165     // We cannot read 64 bits from a general purpose register, thus we move the
4166     // data required to compose 64 1's into the instruction stream.
4167     // We emit a 64-byte-wide series of elements 0..63 which is later used as the
4168     // compare target against the tail count contained in the tmp1 register.
4169     // The result is a k register having tmp1 consecutive 1's, counting from the
4170     // least significant bit.
4171     address tmp = pc();
4172     emit_int64(0x0706050403020100);
4173     emit_int64(0x0F0E0D0C0B0A0908);
4174     emit_int64(0x1716151413121110);
4175     emit_int64(0x1F1E1D1C1B1A1918);
4176     emit_int64(0x2726252423222120);
4177     emit_int64(0x2F2E2D2C2B2A2928);
4178     emit_int64(0x3736353433323130);
4179     emit_int64(0x3F3E3D3C3B3A3938);
4180 
4181     bind(k_init);
4182     lea(len, InternalAddress(tmp));
4183     // create mask to test for negative byte inside a vector
4184     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4185     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4186 
4187 #endif
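         // For each tail byte selected by mask2, mask1 records whether that byte is
         // negative (0 > byte); ktestq then checks whether any selected tail byte was negative.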
4188     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4189     ktestq(mask1, mask2);
4190     jcc(Assembler::zero, DONE);
4191 
4192     // do a full check for negative bytes in the tail
4193     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4194                      // ary1 already pointing to the right place
4195     jmpb(TAIL_START);
4196 
4197     bind(BREAK_LOOP);
4198     // At least one byte in the last 64 byte block was negative.
4199     // Set up to look at the last 64 bytes as if they were a tail
4200     lea(ary1, Address(ary1, len, Address::times_1));
4201     addptr(result, len);
4202     // Ignore the very last byte: if all others are positive,
4203     // it must be negative, so we can skip right to the 2+1 byte
4204     // end comparison at this point
4205     orl(result, 63);
4206     movl(len, 63);
4207     // Fallthru to tail compare
4208   } else {
4209 
4210     if (UseAVX >= 2 && UseSSE >= 2) {
4211       // With AVX2, use 32-byte vector compare
4212       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4213 
4214       // Compare 32-byte vectors
4215       testl(len, 0xffffffe0);   // vector count (in bytes)
4216       jccb(Assembler::zero, TAIL_START);
4217 
4218       andl(len, 0xffffffe0);
4219       lea(ary1, Address(ary1, len, Address::times_1));
4220       negptr(len);
4221 
4222       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4223       movdl(vec2, tmp1);
4224       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4225 
4226       bind(COMPARE_WIDE_VECTORS);
4227       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4228       vptest(vec1, vec2);
4229       jccb(Assembler::notZero, BREAK_LOOP);
4230       addptr(len, 32);
4231       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4232 
4233       testl(result, 0x0000001f);   // any bytes remaining?
4234       jcc(Assembler::zero, DONE);
4235 
4236       // Quick test using the already prepared vector mask
4237       movl(len, result);
4238       andl(len, 0x0000001f);
4239       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4240       vptest(vec1, vec2);
4241       jcc(Assembler::zero, DONE);
4242       // There are negative bytes, jump to the tail to determine exactly where
4243       jmpb(TAIL_START);
4244 
4245       bind(BREAK_LOOP);
4246       // At least one byte in the last 32-byte vector is negative.
4247       // Set up to look at the last 32 bytes as if they were a tail
4248       lea(ary1, Address(ary1, len, Address::times_1));
4249       addptr(result, len);
4250       // Ignore the very last byte: if all others are positive,
4251       // it must be negative, so we can skip right to the 2+1 byte
4252       // end comparison at this point
4253       orl(result, 31);
4254       movl(len, 31);
4255       // Fallthru to tail compare
4256     } else if (UseSSE42Intrinsics) {
4257       // With SSE4.2, use double quad vector compare
4258       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4259 
4260       // Compare 16-byte vectors
4261       testl(len, 0xfffffff0);   // vector count (in bytes)
4262       jcc(Assembler::zero, TAIL_START);
4263 
4264       andl(len, 0xfffffff0);
4265       lea(ary1, Address(ary1, len, Address::times_1));
4266       negptr(len);
4267 
4268       movl(tmp1, 0x80808080);
4269       movdl(vec2, tmp1);
4270       pshufd(vec2, vec2, 0);
4271 
4272       bind(COMPARE_WIDE_VECTORS);
4273       movdqu(vec1, Address(ary1, len, Address::times_1));
4274       ptest(vec1, vec2);
4275       jccb(Assembler::notZero, BREAK_LOOP);
4276       addptr(len, 16);
4277       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4278 
4279       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4280       jcc(Assembler::zero, DONE);
4281 
4282       // Quick test using the already prepared vector mask
4283       movl(len, result);
4284       andl(len, 0x0000000f);   // tail count (in bytes)
4285       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4286       ptest(vec1, vec2);
4287       jcc(Assembler::zero, DONE);
4288       jmpb(TAIL_START);
4289 
4290       bind(BREAK_LOOP);
4291       // At least one byte in the last 16-byte vector is negative.
4292       // Set up and look at the last 16 bytes as if they were a tail
4293       lea(ary1, Address(ary1, len, Address::times_1));
4294       addptr(result, len);
4295       // Ignore the very last byte: if all others are positive,
4296       // it must be negative, so we can skip right to the 2+1 byte
4297       // end comparison at this point
4298       orl(result, 15);
4299       movl(len, 15);
4300       // Fallthru to tail compare
4301     }
4302   }
4303 
4304   bind(TAIL_START);
4305   // Compare 4-byte vectors
4306   andl(len, 0xfffffffc); // vector count (in bytes)
4307   jccb(Assembler::zero, COMPARE_CHAR);
4308 
4309   lea(ary1, Address(ary1, len, Address::times_1));
4310   negptr(len);
4311 
4312   bind(COMPARE_VECTORS);
4313   movl(tmp1, Address(ary1, len, Address::times_1));
4314   andl(tmp1, 0x80808080);
4315   jccb(Assembler::notZero, TAIL_ADJUST);
4316   addptr(len, 4);
4317   jccb(Assembler::notZero, COMPARE_VECTORS);
4318 
4319   // Compare trailing char (final 2-3 bytes), if any
4320   bind(COMPARE_CHAR);
4321 
4322   testl(result, 0x2);   // tail  char
4323   jccb(Assembler::zero, COMPARE_BYTE);
4324   load_unsigned_short(tmp1, Address(ary1, 0));
4325   andl(tmp1, 0x00008080);
4326   jccb(Assembler::notZero, CHAR_ADJUST);
4327   lea(ary1, Address(ary1, 2));
4328 
4329   bind(COMPARE_BYTE);
4330   testl(result, 0x1);   // tail  byte
4331   jccb(Assembler::zero, DONE);
4332   load_unsigned_byte(tmp1, Address(ary1, 0));
4333   testl(tmp1, 0x00000080);
4334   jccb(Assembler::zero, DONE);
4335   subptr(result, 1);
4336   jmpb(DONE);
4337 
4338   bind(TAIL_ADJUST);
4339   // there are negative bits in the last 4 byte block.
4340   // Adjust result and check the next three bytes
4341   addptr(result, len);
4342   orl(result, 3);
4343   lea(ary1, Address(ary1, len, Address::times_1));
4344   jmpb(COMPARE_CHAR);
4345 
4346   bind(CHAR_ADJUST);
4347   // We are looking at a char + optional byte tail, and found that one
4348   // of the bytes in the char is negative. Adjust the result, check the
4349   // first byte and readjust if needed.
4350   andl(result, 0xfffffffc);
4351   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4352   jccb(Assembler::notZero, DONE);
4353   addptr(result, 1);
4354 
4355   // That's it
4356   bind(DONE);
4357   if (UseAVX >= 2 && UseSSE >= 2) {
4358     // clean upper bits of YMM registers
4359     vpxor(vec1, vec1);
4360     vpxor(vec2, vec2);
4361   }
4362 }
4363 
4364 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
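     // On return, result is 1 if the arrays (or ranges) are equal and 0 otherwise.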
4365 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4366                                       Register limit, Register result, Register chr,
4367                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4368   ShortBranchVerifier sbv(this);
4369   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4370 
4371   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4372   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4373 
4374   if (is_array_equ) {
4375     // Check the input args
4376     cmpoop(ary1, ary2);
4377     jcc(Assembler::equal, TRUE_LABEL);
4378 
4379     // Need additional checks for arrays_equals.
4380     testptr(ary1, ary1);
4381     jcc(Assembler::zero, FALSE_LABEL);
4382     testptr(ary2, ary2);
4383     jcc(Assembler::zero, FALSE_LABEL);
4384 
4385     // Check the lengths
4386     movl(limit, Address(ary1, length_offset));
4387     cmpl(limit, Address(ary2, length_offset));
4388     jcc(Assembler::notEqual, FALSE_LABEL);
4389   }
4390 
4391   // count == 0
4392   testl(limit, limit);
4393   jcc(Assembler::zero, TRUE_LABEL);
4394 
4395   if (is_array_equ) {
4396     // Load array address
4397     lea(ary1, Address(ary1, base_offset));
4398     lea(ary2, Address(ary2, base_offset));
4399   }
4400 
4401   if (is_array_equ && is_char) {
4402     // arrays_equals when used for char[].
4403     shll(limit, 1);      // convert char count to byte count (still != 0)
4404   }
4405   movl(result, limit); // copy
4406 
4407   if (UseAVX >= 2) {
4408     // With AVX2, use 32-byte vector compare
4409     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4410 
4411     // Compare 32-byte vectors
4412     andl(result, 0x0000001f);  //   tail count (in bytes)
4413     andl(limit, 0xffffffe0);   // vector count (in bytes)
4414     jcc(Assembler::zero, COMPARE_TAIL);
4415 
4416     lea(ary1, Address(ary1, limit, Address::times_1));
4417     lea(ary2, Address(ary2, limit, Address::times_1));
4418     negptr(limit);
4419 
4420 #ifdef _LP64
4421     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4422       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4423 
4424       cmpl(limit, -64);
4425       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4426 
4427       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4428 
4429       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4430       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4431       kortestql(mask, mask);
4432       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4433       addptr(limit, 64);  // update since we already compared at this addr
4434       cmpl(limit, -64);
4435       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4436 
4437       // At this point we may still need to compare -limit+result bytes.
4438       // We could execute the next two instructions and just continue via the non-wide path:
4439       //  cmpl(limit, 0);
4440       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4441       // But since we stopped at the points ary{1,2}+limit which are
4442       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4443       // (|limit| <= 32 and result < 32),
4444       // we may just compare the last 64 bytes.
4445       //
4446       addptr(result, -64);   // it is safe because we just came from this area
4447       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4448       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4449       kortestql(mask, mask);
4450       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4451 
4452       jmp(TRUE_LABEL);
4453 
4454       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4455 
4456     }//if (VM_Version::supports_avx512vlbw())
4457 #endif //_LP64
4458     bind(COMPARE_WIDE_VECTORS);
4459     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4460     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4461     vpxor(vec1, vec2);
4462 
4463     vptest(vec1, vec1);
4464     jcc(Assembler::notZero, FALSE_LABEL);
4465     addptr(limit, 32);
4466     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4467 
4468     testl(result, result);
4469     jcc(Assembler::zero, TRUE_LABEL);
4470 
4471     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4472     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4473     vpxor(vec1, vec2);
4474 
4475     vptest(vec1, vec1);
4476     jccb(Assembler::notZero, FALSE_LABEL);
4477     jmpb(TRUE_LABEL);
4478 
4479     bind(COMPARE_TAIL); // limit is zero
4480     movl(limit, result);
4481     // Fallthru to tail compare
4482   } else if (UseSSE42Intrinsics) {
4483     // With SSE4.2, use double quad vector compare
4484     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4485 
4486     // Compare 16-byte vectors
4487     andl(result, 0x0000000f);  //   tail count (in bytes)
4488     andl(limit, 0xfffffff0);   // vector count (in bytes)
4489     jcc(Assembler::zero, COMPARE_TAIL);
4490 
4491     lea(ary1, Address(ary1, limit, Address::times_1));
4492     lea(ary2, Address(ary2, limit, Address::times_1));
4493     negptr(limit);
4494 
4495     bind(COMPARE_WIDE_VECTORS);
4496     movdqu(vec1, Address(ary1, limit, Address::times_1));
4497     movdqu(vec2, Address(ary2, limit, Address::times_1));
4498     pxor(vec1, vec2);
4499 
4500     ptest(vec1, vec1);
4501     jcc(Assembler::notZero, FALSE_LABEL);
4502     addptr(limit, 16);
4503     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4504 
4505     testl(result, result);
4506     jcc(Assembler::zero, TRUE_LABEL);
4507 
4508     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4509     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4510     pxor(vec1, vec2);
4511 
4512     ptest(vec1, vec1);
4513     jccb(Assembler::notZero, FALSE_LABEL);
4514     jmpb(TRUE_LABEL);
4515 
4516     bind(COMPARE_TAIL); // limit is zero
4517     movl(limit, result);
4518     // Fallthru to tail compare
4519   }
4520 
4521   // Compare 4-byte vectors
4522   andl(limit, 0xfffffffc); // vector count (in bytes)
4523   jccb(Assembler::zero, COMPARE_CHAR);
4524 
4525   lea(ary1, Address(ary1, limit, Address::times_1));
4526   lea(ary2, Address(ary2, limit, Address::times_1));
4527   negptr(limit);
4528 
4529   bind(COMPARE_VECTORS);
4530   movl(chr, Address(ary1, limit, Address::times_1));
4531   cmpl(chr, Address(ary2, limit, Address::times_1));
4532   jccb(Assembler::notEqual, FALSE_LABEL);
4533   addptr(limit, 4);
4534   jcc(Assembler::notZero, COMPARE_VECTORS);
4535 
4536   // Compare trailing char (final 2 bytes), if any
4537   bind(COMPARE_CHAR);
4538   testl(result, 0x2);   // tail  char
4539   jccb(Assembler::zero, COMPARE_BYTE);
4540   load_unsigned_short(chr, Address(ary1, 0));
4541   load_unsigned_short(limit, Address(ary2, 0));
4542   cmpl(chr, limit);
4543   jccb(Assembler::notEqual, FALSE_LABEL);
4544 
4545   if (is_array_equ && is_char) {
4546     bind(COMPARE_BYTE);
4547   } else {
4548     lea(ary1, Address(ary1, 2));
4549     lea(ary2, Address(ary2, 2));
4550 
4551     bind(COMPARE_BYTE);
4552     testl(result, 0x1);   // tail  byte
4553     jccb(Assembler::zero, TRUE_LABEL);
4554     load_unsigned_byte(chr, Address(ary1, 0));
4555     load_unsigned_byte(limit, Address(ary2, 0));
4556     cmpl(chr, limit);
4557     jccb(Assembler::notEqual, FALSE_LABEL);
4558   }
4559   bind(TRUE_LABEL);
4560   movl(result, 1);   // return true
4561   jmpb(DONE);
4562 
4563   bind(FALSE_LABEL);
4564   xorl(result, result); // return false
4565 
4566   // That's it
4567   bind(DONE);
4568   if (UseAVX >= 2) {
4569     // clean upper bits of YMM registers
4570     vpxor(vec1, vec1);
4571     vpxor(vec2, vec2);
4572   }
4573 }
4574 
4575 #ifdef _LP64
4576 
4577 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4578 #define __ masm.
4579   Register dst = stub.data<0>();
4580   XMMRegister src = stub.data<1>();
4581   address target = stub.data<2>();
4582   __ bind(stub.entry());
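       // Pass the source value to the out-of-line fixup routine on the stack and pop
       // the corrected result into dst.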
4583   __ subptr(rsp, 8);
4584   __ movdbl(Address(rsp), src);
4585   __ call(RuntimeAddress(target));
4586   __ pop(dst);
4587   __ jmp(stub.continuation());
4588 #undef __
4589 }
4590 
4591 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4592   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4593   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4594 
4595   address slowpath_target;
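       // When the input is NaN or out of range, the cvtt* instructions return the
       // 'integer indefinite' value (0x80000000 / 0x8000000000000000), so comparing the
       // result against that sentinel sends only those rare cases to the fixup stub.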
4596   if (dst_bt == T_INT) {
4597     if (src_bt == T_FLOAT) {
4598       cvttss2sil(dst, src);
4599       cmpl(dst, 0x80000000);
4600       slowpath_target = StubRoutines::x86::f2i_fixup();
4601     } else {
4602       cvttsd2sil(dst, src);
4603       cmpl(dst, 0x80000000);
4604       slowpath_target = StubRoutines::x86::d2i_fixup();
4605     }
4606   } else {
4607     if (src_bt == T_FLOAT) {
4608       cvttss2siq(dst, src);
4609       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4610       slowpath_target = StubRoutines::x86::f2l_fixup();
4611     } else {
4612       cvttsd2siq(dst, src);
4613       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4614       slowpath_target = StubRoutines::x86::d2l_fixup();
4615     }
4616   }
4617 
4618   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4619   jcc(Assembler::equal, stub->entry());
4620   bind(stub->continuation());
4621 }
4622 
4623 #endif // _LP64
4624 
4625 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4626                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4627   switch(ideal_opc) {
4628     case Op_LShiftVS:
4629       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4630     case Op_LShiftVI:
4631       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4632     case Op_LShiftVL:
4633       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4634     case Op_RShiftVS:
4635       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4636     case Op_RShiftVI:
4637       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4638     case Op_RShiftVL:
4639       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4640     case Op_URShiftVS:
4641       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4642     case Op_URShiftVI:
4643       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4644     case Op_URShiftVL:
4645       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4646     case Op_RotateRightV:
4647       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4648     case Op_RotateLeftV:
4649       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4650     default:
4651       fatal("Unsupported masked operation"); break;
4652   }
4653 }
4654 
4655 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4656                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4657                                     bool is_varshift) {
4658   switch (ideal_opc) {
4659     case Op_AddVB:
4660       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4661     case Op_AddVS:
4662       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4663     case Op_AddVI:
4664       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4665     case Op_AddVL:
4666       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4667     case Op_AddVF:
4668       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4669     case Op_AddVD:
4670       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4671     case Op_SubVB:
4672       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4673     case Op_SubVS:
4674       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4675     case Op_SubVI:
4676       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4677     case Op_SubVL:
4678       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4679     case Op_SubVF:
4680       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4681     case Op_SubVD:
4682       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4683     case Op_MulVS:
4684       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4685     case Op_MulVI:
4686       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4687     case Op_MulVL:
4688       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4689     case Op_MulVF:
4690       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4691     case Op_MulVD:
4692       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4693     case Op_DivVF:
4694       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4695     case Op_DivVD:
4696       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4697     case Op_SqrtVF:
4698       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4699     case Op_SqrtVD:
4700       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4701     case Op_AbsVB:
4702       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4703     case Op_AbsVS:
4704       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4705     case Op_AbsVI:
4706       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4707     case Op_AbsVL:
4708       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4709     case Op_FmaVF:
4710       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4711     case Op_FmaVD:
4712       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4713     case Op_VectorRearrange:
4714       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4715     case Op_LShiftVS:
4716       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4717     case Op_LShiftVI:
4718       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4719     case Op_LShiftVL:
4720       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4721     case Op_RShiftVS:
4722       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4723     case Op_RShiftVI:
4724       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4725     case Op_RShiftVL:
4726       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4727     case Op_URShiftVS:
4728       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4729     case Op_URShiftVI:
4730       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4731     case Op_URShiftVL:
4732       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4733     case Op_RotateLeftV:
4734       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4735     case Op_RotateRightV:
4736       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4737     case Op_MaxV:
4738       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4739     case Op_MinV:
4740       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4741     case Op_XorV:
4742       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4743     case Op_OrV:
4744       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4745     case Op_AndV:
4746       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4747     default:
4748       fatal("Unsupported masked operation"); break;
4749   }
4750 }
4751 
4752 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4753                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4754   switch (ideal_opc) {
4755     case Op_AddVB:
4756       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_AddVS:
4758       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4759     case Op_AddVI:
4760       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4761     case Op_AddVL:
4762       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_AddVF:
4764       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_AddVD:
4766       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_SubVB:
4768       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_SubVS:
4770       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_SubVI:
4772       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_SubVL:
4774       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_SubVF:
4776       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_SubVD:
4778       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_MulVS:
4780       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4781     case Op_MulVI:
4782       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4783     case Op_MulVL:
4784       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4785     case Op_MulVF:
4786       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4787     case Op_MulVD:
4788       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_DivVF:
4790       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_DivVD:
4792       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4793     case Op_FmaVF:
4794       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4795     case Op_FmaVD:
4796       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4797     case Op_MaxV:
4798       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_MinV:
4800       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4801     case Op_XorV:
4802       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4803     case Op_OrV:
4804       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4805     case Op_AndV:
4806       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4807     default:
4808       fatal("Unsupported masked operation"); break;
4809   }
4810 }
4811 
4812 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4813                                   KRegister src1, KRegister src2) {
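       // The k-register operation width follows the mask length: up to 8 bits uses the
       // byte form, 16 the word form, 32 the dword form and 64 the qword form.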
4814   BasicType etype = T_ILLEGAL;
4815   switch(mask_len) {
4816     case 2:
4817     case 4:
4818     case 8:  etype = T_BYTE; break;
4819     case 16: etype = T_SHORT; break;
4820     case 32: etype = T_INT; break;
4821     case 64: etype = T_LONG; break;
4822     default: fatal("Unsupported type"); break;
4823   }
4824   assert(etype != T_ILLEGAL, "");
4825   switch(ideal_opc) {
4826     case Op_AndVMask:
4827       kand(etype, dst, src1, src2); break;
4828     case Op_OrVMask:
4829       kor(etype, dst, src1, src2); break;
4830     case Op_XorVMask:
4831       kxor(etype, dst, src1, src2); break;
4832     default:
4833       fatal("Unsupported masked operation"); break;
4834   }
4835 }
4836 
4837 /*
4838  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4839  * If src is NaN, the result is 0.
4840  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4841  * the result is equal to the value of Integer.MIN_VALUE.
4842  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4843  * the result is equal to the value of Integer.MAX_VALUE.
4844  */
4845 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4846                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4847                                                                    Register rscratch, AddressLiteral float_sign_flip,
4848                                                                    int vec_enc) {
4849   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4850   Label done;
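       // Lanes equal to float_sign_flip (0x80000000) were produced by a special value
       // (NaN/Inf/out-of-range) in the preceding cast; if none match, take the fast path out.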
4851   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4852   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4853   vptest(xtmp2, xtmp2, vec_enc);
4854   jccb(Assembler::equal, done);
4855 
4856   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4857   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4858 
4859   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4860   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4861   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4862 
4863   // Recompute the mask for the remaining special values.
4864   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4865   // Extract SRC values corresponding to TRUE mask lanes.
4866   vpand(xtmp4, xtmp2, src, vec_enc);
4867   // Flip the mask bits so that the MSB of the mask lanes corresponding to +ve
4868   // special values is set.
4869   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4870 
4871   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4872   bind(done);
4873 }
4874 
4875 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4876                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4877                                                                     Register rscratch, AddressLiteral float_sign_flip,
4878                                                                     int vec_enc) {
4879   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4880   Label done;
4881   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4882   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4883   kortestwl(ktmp1, ktmp1);
4884   jccb(Assembler::equal, done);
4885 
4886   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4887   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4888   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4889 
4890   kxorwl(ktmp1, ktmp1, ktmp2);
4891   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4892   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4893   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4894   bind(done);
4895 }
4896 
4897 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4898                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4899                                                                      Register rscratch, AddressLiteral double_sign_flip,
4900                                                                      int vec_enc) {
4901   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4902 
4903   Label done;
4904   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4905   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4906   kortestwl(ktmp1, ktmp1);
4907   jccb(Assembler::equal, done);
4908 
4909   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4910   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4911   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4912 
4913   kxorwl(ktmp1, ktmp1, ktmp2);
4914   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4915   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4916   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4917   bind(done);
4918 }
4919 
4920 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4921                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4922                                                                      Register rscratch, AddressLiteral float_sign_flip,
4923                                                                      int vec_enc) {
4924   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4925   Label done;
4926   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4927   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4928   kortestwl(ktmp1, ktmp1);
4929   jccb(Assembler::equal, done);
4930 
4931   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4932   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4933   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4934 
4935   kxorwl(ktmp1, ktmp1, ktmp2);
4936   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4937   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4938   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4939   bind(done);
4940 }
4941 
4942 /*
4943  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4944  * If src is NaN, the result is 0.
4945  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4946  * the result is equal to the value of Long.MIN_VALUE.
4947  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4948  * the result is equal to the value of Long.MAX_VALUE.
4949  */
4950 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4951                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4952                                                                       Register rscratch, AddressLiteral double_sign_flip,
4953                                                                       int vec_enc) {
4954   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4955 
4956   Label done;
4957   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4958   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4959   kortestwl(ktmp1, ktmp1);
4960   jccb(Assembler::equal, done);
4961 
4962   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4963   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4964   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4965 
4966   kxorwl(ktmp1, ktmp1, ktmp2);
4967   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4968   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4969   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4970   bind(done);
4971 }
4972 
4973 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4974                                                              XMMRegister xtmp, int index, int vec_enc) {
4975    assert(vec_enc < Assembler::AVX_512bit, "");
4976    if (vec_enc == Assembler::AVX_256bit) {
4977      vextractf128_high(xtmp, src);
4978      vshufps(dst, src, xtmp, index, vec_enc);
4979    } else {
4980      vshufps(dst, src, zero, index, vec_enc);
4981    }
4982 }
4983 
4984 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4985                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4986                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4987   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4988 
4989   Label done;
4990   // Compare the destination lanes with float_sign_flip
4991   // value to get mask for all special values.
4992   movdqu(xtmp1, float_sign_flip, rscratch);
4993   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4994   ptest(xtmp2, xtmp2);
4995   jccb(Assembler::equal, done);
4996 
4997   // Flip float_sign_flip to get max integer value.
4998   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4999   pxor(xtmp1, xtmp4);
5000 
5001   // Set destination lanes corresponding to unordered (NaN) source lanes to zero.
5002   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5003   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5004 
5005   // Shuffle the mask vector and pack the lower double word from each quadword lane.
5006   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5007   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5008 
5009   // Recompute the mask for the remaining special values.
5010   pxor(xtmp2, xtmp3);
5011   // Extract mask corresponding to non-negative source lanes.
5012   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5013 
5014   // Shuffle the mask vector and pack the lower double word from each quadword lane.
5015   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5016   pand(xtmp3, xtmp2);
5017 
5018   // Replace destination lanes holding special value(0x80000000) with max int
5019   // if corresponding source lane holds a +ve value.
5020   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5021   bind(done);
5022 }
5023 
5024 
5025 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5026                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
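       // Narrow int lanes to short/byte: mask off the upper bits of every int lane and
       // pack with unsigned saturation; on 256-bit AVX2 the pack works within 128-bit
       // lanes, so a crosslane shuffle gathers the packed halves afterwards.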
5027   switch(to_elem_bt) {
5028     case T_SHORT:
5029       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5030       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5031       vpackusdw(dst, dst, zero, vec_enc);
5032       if (vec_enc == Assembler::AVX_256bit) {
5033         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5034       }
5035       break;
5036     case  T_BYTE:
5037       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5038       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5039       vpackusdw(dst, dst, zero, vec_enc);
5040       if (vec_enc == Assembler::AVX_256bit) {
5041         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5042       }
5043       vpackuswb(dst, dst, zero, vec_enc);
5044       break;
5045     default: assert(false, "%s", type2name(to_elem_bt));
5046   }
5047 }
5048 
5049 /*
5050  * Algorithm for vector D2L and F2I conversions:-
5051  * a) Perform vector D2L/F2I cast.
5052  * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5053  *    It signifies that source value could be any of the special floating point
5054  *    values(NaN,-Inf,Inf,Max,-Min).
5055  * c) Set destination to zero if source is NaN value.
5056  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5057  */
5058 
5059 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5060                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5061                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5062   int to_elem_sz = type2aelembytes(to_elem_bt);
5063   assert(to_elem_sz <= 4, "");
5064   vcvttps2dq(dst, src, vec_enc);
5065   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5066   if (to_elem_sz < 4) {
5067     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5068     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5069   }
5070 }
5071 
5072 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5073                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5074                                             Register rscratch, int vec_enc) {
5075   int to_elem_sz = type2aelembytes(to_elem_bt);
5076   assert(to_elem_sz <= 4, "");
5077   vcvttps2dq(dst, src, vec_enc);
5078   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5079   switch(to_elem_bt) {
5080     case T_INT:
5081       break;
5082     case T_SHORT:
5083       evpmovdw(dst, dst, vec_enc);
5084       break;
5085     case T_BYTE:
5086       evpmovdb(dst, dst, vec_enc);
5087       break;
5088     default: assert(false, "%s", type2name(to_elem_bt));
5089   }
5090 }
5091 
5092 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5093                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5094                                             Register rscratch, int vec_enc) {
5095   evcvttps2qq(dst, src, vec_enc);
5096   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5097 }
5098 
5099 // Handling for downcasting from double to integer or sub-word types on AVX2.
5100 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5101                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5102                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5103   int to_elem_sz = type2aelembytes(to_elem_bt);
5104   assert(to_elem_sz < 8, "");
5105   vcvttpd2dq(dst, src, vec_enc);
5106   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5107                                               float_sign_flip, vec_enc);
5108   if (to_elem_sz < 4) {
5109     // xtmp4 holds all zero lanes.
5110     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5111   }
5112 }
5113 
5114 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5115                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5116                                             KRegister ktmp2, AddressLiteral sign_flip,
5117                                             Register rscratch, int vec_enc) {
5118   if (VM_Version::supports_avx512dq()) {
5119     evcvttpd2qq(dst, src, vec_enc);
5120     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5121     switch(to_elem_bt) {
5122       case T_LONG:
5123         break;
5124       case T_INT:
5125         evpmovsqd(dst, dst, vec_enc);
5126         break;
5127       case T_SHORT:
5128         evpmovsqd(dst, dst, vec_enc);
5129         evpmovdw(dst, dst, vec_enc);
5130         break;
5131       case T_BYTE:
5132         evpmovsqd(dst, dst, vec_enc);
5133         evpmovdb(dst, dst, vec_enc);
5134         break;
5135       default: assert(false, "%s", type2name(to_elem_bt));
5136     }
5137   } else {
5138     assert(type2aelembytes(to_elem_bt) <= 4, "");
5139     vcvttpd2dq(dst, src, vec_enc);
5140     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5141     switch(to_elem_bt) {
5142       case T_INT:
5143         break;
5144       case T_SHORT:
5145         evpmovdw(dst, dst, vec_enc);
5146         break;
5147       case T_BYTE:
5148         evpmovdb(dst, dst, vec_enc);
5149         break;
5150       default: assert(false, "%s", type2name(to_elem_bt));
5151     }
5152   }
5153 }
5154 
5155 #ifdef _LP64
5156 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5157                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5158                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5159   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5160   // and restore the original MXCSR.RC mode afterwards.
5161   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5162 
5163   mov64(tmp, julong_cast(0.5L));
5164   evpbroadcastq(xtmp1, tmp, vec_enc);
5165   vaddpd(xtmp1, src , xtmp1, vec_enc);
5166   evcvtpd2qq(dst, xtmp1, vec_enc);
5167   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5168                                                 double_sign_flip, vec_enc);
5169 
5170   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5171 }
5172 
5173 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5174                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5175                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5176   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5177   // and restore the original MXCSR.RC mode afterwards.
5178   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5179 
5180   movl(tmp, jint_cast(0.5));
5181   movq(xtmp1, tmp);
5182   vbroadcastss(xtmp1, xtmp1, vec_enc);
5183   vaddps(xtmp1, src , xtmp1, vec_enc);
5184   vcvtps2dq(dst, xtmp1, vec_enc);
5185   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5186                                               float_sign_flip, vec_enc);
5187 
5188   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5189 }
5190 
5191 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5192                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5193                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5194   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5195   // and restore the original MXCSR.RC mode afterwards.
5196   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5197 
5198   movl(tmp, jint_cast(0.5));
5199   movq(xtmp1, tmp);
5200   vbroadcastss(xtmp1, xtmp1, vec_enc);
5201   vaddps(xtmp1, src , xtmp1, vec_enc);
5202   vcvtps2dq(dst, xtmp1, vec_enc);
5203   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5204 
5205   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5206 }
5207 #endif // _LP64
5208 
5209 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5210                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5211   switch (from_elem_bt) {
5212     case T_BYTE:
5213       switch (to_elem_bt) {
5214         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5215         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5216         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5217         default: ShouldNotReachHere();
5218       }
5219       break;
5220     case T_SHORT:
5221       switch (to_elem_bt) {
5222         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5223         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5224         default: ShouldNotReachHere();
5225       }
5226       break;
5227     case T_INT:
5228       assert(to_elem_bt == T_LONG, "");
5229       vpmovzxdq(dst, src, vlen_enc);
5230       break;
5231     default:
5232       ShouldNotReachHere();
5233   }
5234 }
5235 
5236 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5237                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5238   switch (from_elem_bt) {
5239     case T_BYTE:
5240       switch (to_elem_bt) {
5241         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5242         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5243         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5244         default: ShouldNotReachHere();
5245       }
5246       break;
5247     case T_SHORT:
5248       switch (to_elem_bt) {
5249         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5250         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5251         default: ShouldNotReachHere();
5252       }
5253       break;
5254     case T_INT:
5255       assert(to_elem_bt == T_LONG, "");
5256       vpmovsxdq(dst, src, vlen_enc);
5257       break;
5258     default:
5259       ShouldNotReachHere();
5260   }
5261 }
5262 
5263 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5264                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5265   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5266   assert(vlen_enc != AVX_512bit, "");
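       // Mask lanes are all-zeros or all-ones, so sign-extending moves widen them and
       // signed saturating packs narrow them without changing the 0 / -1 lane values.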
5267 
5268   int dst_bt_size = type2aelembytes(dst_bt);
5269   int src_bt_size = type2aelembytes(src_bt);
5270   if (dst_bt_size > src_bt_size) {
5271     switch (dst_bt_size / src_bt_size) {
5272       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5273       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5274       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5275       default: ShouldNotReachHere();
5276     }
5277   } else {
5278     assert(dst_bt_size < src_bt_size, "");
5279     switch (src_bt_size / dst_bt_size) {
5280       case 2: {
5281         if (vlen_enc == AVX_128bit) {
5282           vpacksswb(dst, src, src, vlen_enc);
5283         } else {
5284           vpacksswb(dst, src, src, vlen_enc);
5285           vpermq(dst, dst, 0x08, vlen_enc);
5286         }
5287         break;
5288       }
5289       case 4: {
5290         if (vlen_enc == AVX_128bit) {
5291           vpackssdw(dst, src, src, vlen_enc);
5292           vpacksswb(dst, dst, dst, vlen_enc);
5293         } else {
5294           vpackssdw(dst, src, src, vlen_enc);
5295           vpermq(dst, dst, 0x08, vlen_enc);
5296           vpacksswb(dst, dst, dst, AVX_128bit);
5297         }
5298         break;
5299       }
5300       case 8: {
5301         if (vlen_enc == AVX_128bit) {
5302           vpshufd(dst, src, 0x08, vlen_enc);
5303           vpackssdw(dst, dst, dst, vlen_enc);
5304           vpacksswb(dst, dst, dst, vlen_enc);
5305         } else {
5306           vpshufd(dst, src, 0x08, vlen_enc);
5307           vpermq(dst, dst, 0x08, vlen_enc);
5308           vpackssdw(dst, dst, dst, AVX_128bit);
5309           vpacksswb(dst, dst, dst, AVX_128bit);
5310         }
5311         break;
5312       }
5313       default: ShouldNotReachHere();
5314     }
5315   }
5316 }
5317 
5318 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5319                                    bool merge, BasicType bt, int vlen_enc) {
5320   if (bt == T_INT) {
5321     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5322   } else {
5323     assert(bt == T_LONG, "");
5324     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5325   }
5326 }
5327 
5328 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5329                                    bool merge, BasicType bt, int vlen_enc) {
5330   if (bt == T_INT) {
5331     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5332   } else {
5333     assert(bt == T_LONG, "");
5334     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5335   }
5336 }
5337 
5338 #ifdef _LP64
5339 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5340                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5341                                                int vec_enc) {
5342   int index = 0;
5343   int vindex = 0;
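       // pdepq deposits one bit of src into each byte position of the 0x0101..01
       // pattern, expanding the low 8 mask bits into eight 0x00/0x01 byte lanes.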
5344   mov64(rtmp1, 0x0101010101010101L);
5345   pdepq(rtmp1, src, rtmp1);
5346   if (mask_len > 8) {
5347     movq(rtmp2, src);
5348     vpxor(xtmp, xtmp, xtmp, vec_enc);
5349     movq(xtmp, rtmp1);
5350   }
5351   movq(dst, rtmp1);
5352 
5353   mask_len -= 8;
5354   while (mask_len > 0) {
5355     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5356     index++;
5357     if ((index % 2) == 0) {
5358       pxor(xtmp, xtmp);
5359     }
5360     mov64(rtmp1, 0x0101010101010101L);
5361     shrq(rtmp2, 8);
5362     pdepq(rtmp1, rtmp2, rtmp1);
5363     pinsrq(xtmp, rtmp1, index % 2);
5364     vindex = index / 2;
5365     if (vindex) {
5366       // Write the entire 16-byte vector only when both 64-bit
5367       // lanes have been updated, to save redundant instructions.
5368       if (index % 2) {
5369         vinsertf128(dst, dst, xtmp, vindex);
5370       }
5371     } else {
5372       vmovdqu(dst, xtmp);
5373     }
5374     mask_len -= 8;
5375   }
5376 }
5377 
5378 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5379   switch(opc) {
5380     case Op_VectorMaskTrueCount:
5381       popcntq(dst, tmp);
5382       break;
5383     case Op_VectorMaskLastTrue:
5384       if (VM_Version::supports_lzcnt()) {
5385         lzcntq(tmp, tmp);
5386         movl(dst, 63);
5387         subl(dst, tmp);
5388       } else {
5389         movl(dst, -1);
5390         bsrq(tmp, tmp);
5391         cmov32(Assembler::notZero, dst, tmp);
5392       }
5393       break;
5394     case Op_VectorMaskFirstTrue:
5395       if (VM_Version::supports_bmi1()) {
5396         if (masklen < 32) {
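               // Set a sentinel bit at position masklen so tzcnt returns masklen
               // when no mask bit is set.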
5397           orl(tmp, 1 << masklen);
5398           tzcntl(dst, tmp);
5399         } else if (masklen == 32) {
5400           tzcntl(dst, tmp);
5401         } else {
5402           assert(masklen == 64, "");
5403           tzcntq(dst, tmp);
5404         }
5405       } else {
5406         if (masklen < 32) {
5407           orl(tmp, 1 << masklen);
5408           bsfl(dst, tmp);
5409         } else {
5410           assert(masklen == 32 || masklen == 64, "");
5411           movl(dst, masklen);
5412           if (masklen == 32)  {
5413             bsfl(tmp, tmp);
5414           } else {
5415             bsfq(tmp, tmp);
5416           }
5417           cmov32(Assembler::notZero, dst, tmp);
5418         }
5419       }
5420       break;
5421     case Op_VectorMaskToLong:
5422       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5423       break;
5424     default: assert(false, "Unhandled mask operation");
5425   }
5426 }
5427 
5428 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5429                                               int masklen, int masksize, int vec_enc) {
5430   assert(VM_Version::supports_popcnt(), "");
5431 
5432   if (VM_Version::supports_avx512bw()) {
5433     kmovql(tmp, mask);
5434   } else {
5435     assert(masklen <= 16, "");
5436     kmovwl(tmp, mask);
5437   }
5438 
5439   // A mask generated by partial vector comparison/replicate/mask manipulation
5440   // operations needs to be clipped.
5441   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5442     andq(tmp, (1 << masklen) - 1);
5443   }
5444 
5445   vector_mask_operation_helper(opc, dst, tmp, masklen);
5446 }
5447 
5448 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5449                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5450   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5451          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5452   assert(VM_Version::supports_popcnt(), "");
5453 
5454   bool need_clip = false;
5455   switch(bt) {
5456     case T_BOOLEAN:
5457       // While masks of other types contain lane values of 0 and -1, boolean masks contain lane values of 0 and 1.
5458       vpxor(xtmp, xtmp, xtmp, vec_enc);
5459       vpsubb(xtmp, xtmp, mask, vec_enc);
5460       vpmovmskb(tmp, xtmp, vec_enc);
5461       need_clip = masklen < 16;
5462       break;
5463     case T_BYTE:
5464       vpmovmskb(tmp, mask, vec_enc);
5465       need_clip = masklen < 16;
5466       break;
5467     case T_SHORT:
5468       vpacksswb(xtmp, mask, mask, vec_enc);
5469       if (masklen >= 16) {
5470         vpermpd(xtmp, xtmp, 8, vec_enc);
5471       }
5472       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5473       need_clip = masklen < 16;
5474       break;
5475     case T_INT:
5476     case T_FLOAT:
5477       vmovmskps(tmp, mask, vec_enc);
5478       need_clip = masklen < 4;
5479       break;
5480     case T_LONG:
5481     case T_DOUBLE:
5482       vmovmskpd(tmp, mask, vec_enc);
5483       need_clip = masklen < 2;
5484       break;
5485     default: assert(false, "Unhandled type, %s", type2name(bt));
5486   }
5487 
5488   // A mask generated by partial vector comparison/replicate/mask manipulation
5489   // operations needs to be clipped.
5490   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5491     // need_clip implies masklen < 32
5492     andq(tmp, (1 << masklen) - 1);
5493   }
5494 
5495   vector_mask_operation_helper(opc, dst, tmp, masklen);
5496 }
5497 
5498 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5499                                              Register rtmp2, int mask_len) {
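       // Illustration: pextq gathers the bits of the all-ones value selected by the
       // (clipped) mask into the low end of rtmp2, so dst receives a mask with the
       // same number of set bits packed contiguously from lane 0 (e.g. 0b0101
       // compresses to 0b0011).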
5500   kmov(rtmp1, src);
5501   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5502   mov64(rtmp2, -1L);
5503   pextq(rtmp2, rtmp2, rtmp1);
5504   kmov(dst, rtmp2);
5505 }
5506 
5507 #ifdef _LP64
5508 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5509                                                     XMMRegister mask, Register rtmp, Register rscratch,
5510                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5511                                                     int vec_enc) {
5512   assert(type2aelembytes(bt) >= 4, "");
5513   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5514   address compress_perm_table = nullptr;
5515   address expand_perm_table = nullptr;
5516   if (type2aelembytes(bt) == 8) {
5517     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5518     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5519     vmovmskpd(rtmp, mask, vec_enc);
5520   } else {
5521     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5522     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5523     vmovmskps(rtmp, mask, vec_enc);
5524   }
5525   shlq(rtmp, 5); // for 32 byte permute row.
5526   if (opcode == Op_CompressV) {
5527     lea(rscratch, ExternalAddress(compress_perm_table));
5528   } else {
5529     lea(rscratch, ExternalAddress(expand_perm_table));
5530   }
5531   addptr(rtmp, rscratch);
5532   vmovdqu(permv, Address(rtmp));
5533   vpermps(dst, permv, src, Assembler::AVX_256bit);
5534   vpxor(xtmp, xtmp, xtmp, vec_enc);
5535   // Blend the result with a zero vector using the permute mask: each column entry
5536   // in a permute table row contains either a valid permute index or a -1 (default)
5537   // value, so the same row can also serve as a blending mask after
5538   // compressing/expanding the source vector lanes.
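       // E.g. (illustrative, assuming the row layout described above) an 8-lane int
       // compress with mask 0b00000101 would select the row [0, 2, -1, -1, -1, -1, -1, -1]:
       // lanes 0 and 2 of src are gathered into lanes 0 and 1, and the remaining
       // lanes, flagged by -1, are blended with zero.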
5539   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5540 }
5541 #endif
5542 
5543 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5544                                                bool merge, BasicType bt, int vec_enc) {
5545   if (opcode == Op_CompressV) {
5546     switch(bt) {
5547     case T_BYTE:
5548       evpcompressb(dst, mask, src, merge, vec_enc);
5549       break;
5550     case T_CHAR:
5551     case T_SHORT:
5552       evpcompressw(dst, mask, src, merge, vec_enc);
5553       break;
5554     case T_INT:
5555       evpcompressd(dst, mask, src, merge, vec_enc);
5556       break;
5557     case T_FLOAT:
5558       evcompressps(dst, mask, src, merge, vec_enc);
5559       break;
5560     case T_LONG:
5561       evpcompressq(dst, mask, src, merge, vec_enc);
5562       break;
5563     case T_DOUBLE:
5564       evcompresspd(dst, mask, src, merge, vec_enc);
5565       break;
5566     default:
5567       fatal("Unsupported type %s", type2name(bt));
5568       break;
5569     }
5570   } else {
5571     assert(opcode == Op_ExpandV, "");
5572     switch(bt) {
5573     case T_BYTE:
5574       evpexpandb(dst, mask, src, merge, vec_enc);
5575       break;
5576     case T_CHAR:
5577     case T_SHORT:
5578       evpexpandw(dst, mask, src, merge, vec_enc);
5579       break;
5580     case T_INT:
5581       evpexpandd(dst, mask, src, merge, vec_enc);
5582       break;
5583     case T_FLOAT:
5584       evexpandps(dst, mask, src, merge, vec_enc);
5585       break;
5586     case T_LONG:
5587       evpexpandq(dst, mask, src, merge, vec_enc);
5588       break;
5589     case T_DOUBLE:
5590       evexpandpd(dst, mask, src, merge, vec_enc);
5591       break;
5592     default:
5593       fatal("Unsupported type %s", type2name(bt));
5594       break;
5595     }
5596   }
5597 }
5598 #endif
5599 
5600 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5601                                            KRegister ktmp1, int vec_enc) {
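       // Computes dst = -1.0 for negative lanes, +1.0 for positive lanes, and passes the
       // source value through unchanged for NaN, -0.0 and +0.0 lanes.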
5602   if (opcode == Op_SignumVD) {
5603     vsubpd(dst, zero, one, vec_enc);
5604     // dst = (src < 0) ? -1 : 1
5605     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5606     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5607     // if src is NaN, -0.0 or 0.0, return src.
5608     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5609     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5610   } else {
5611     assert(opcode == Op_SignumVF, "");
5612     vsubps(dst, zero, one, vec_enc);
5613     // dst = (src < 0) ? -1 : 1
5614     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5615     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5616     // if src is NaN, -0.0 or 0.0, return src.
5617     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5618     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5619   }
5620 }
5621 
5622 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5623                                           XMMRegister xtmp1, int vec_enc) {
5624   if (opcode == Op_SignumVD) {
5625     vsubpd(dst, zero, one, vec_enc);
5626     // dst = (src < 0) ? -1 : 1
5627     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5628     // if src is NaN, -0.0 or 0.0, return src.
5629     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5630     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5631   } else {
5632     assert(opcode == Op_SignumVF, "");
5633     vsubps(dst, zero, one, vec_enc);
5634     // dst = (src < 0) ? -1 : 1
5635     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5636     // if src is NaN, -0.0 or 0.0, return src.
5637     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5638     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5639   }
5640 }
5641 
5642 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5643   if (VM_Version::supports_avx512bw()) {
5644     if (mask_len > 32) {
5645       kmovql(dst, src);
5646     } else {
5647       kmovdl(dst, src);
5648       if (mask_len != 32) {
5649         kshiftrdl(dst, dst, 32 - mask_len);
5650       }
5651     }
5652   } else {
5653     assert(mask_len <= 16, "");
5654     kmovwl(dst, src);
5655     if (mask_len != 16) {
5656       kshiftrwl(dst, dst, 16 - mask_len);
5657     }
5658   }
5659 }
5660 
5661 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5662   int lane_size = type2aelembytes(bt);
5663   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5664   if ((is_LP64 || lane_size < 8) &&
5665       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5666        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5667     movptr(rtmp, imm32);
5668     switch(lane_size) {
5669       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5670       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5671       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5672       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5673       default : fatal("Unsupported lane size %d", lane_size); break;
5675     }
5676   } else {
5677     movptr(rtmp, imm32);
5678     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5679     switch(lane_size) {
5680       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5681       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5682       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5683       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5684       default : fatal("Unsupported lane size %d", lane_size); break;
5686     }
5687   }
5688 }
5689 
5690 //
5691 // Following is the lookup table based popcount computation algorithm:
5692 //       Index   Bit set count
5693 //     [ 0000 ->   0,
5694 //       0001 ->   1,
5695 //       0010 ->   1,
5696 //       0011 ->   2,
5697 //       0100 ->   1,
5698 //       0101 ->   2,
5699 //       0110 ->   2,
5700 //       0111 ->   3,
5701 //       1000 ->   1,
5702 //       1001 ->   2,
5703 //       1010 ->   2,
5704 //       1011 ->   3,
5705 //       1100 ->   2,
5706 //       1101 ->   3,
     //       1110 ->   3,
5707 //       1111 ->   4 ]
5708 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5709 //     shuffle indices for lookup table access.
5710 //  b. Right shift each byte of vector lane by 4 positions.
5711 //  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5712 //     shuffle indices for lookup table access.
5713 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5714 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5715 //     count of all the bytes of a quadword.
5716 //  f. Perform step e. for upper 128bit vector lane.
5717 //  g. Pack the bitset count of quadwords back to double word.
5718 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
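     //
     //  Worked byte example (illustrative): for the byte 0xB5 (0b10110101) the
     //  lower nibble 0x5 looks up 2 and the upper nibble 0xB looks up 3, so the
     //  per-byte popcount is 2 + 3 = 5.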
5719 
5720 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5721                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5722   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5723   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5724   vpsrlw(dst, src, 4, vec_enc);
5725   vpand(dst, dst, xtmp1, vec_enc);
5726   vpand(xtmp1, src, xtmp1, vec_enc);
5727   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5728   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5729   vpshufb(dst, xtmp2, dst, vec_enc);
5730   vpaddb(dst, dst, xtmp1, vec_enc);
5731 }
5732 
5733 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5734                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5735   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5736   // Following code is as per steps e,f,g and h of above algorithm.
5737   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5738   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5739   vpsadbw(dst, dst, xtmp2, vec_enc);
5740   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5741   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5742   vpackuswb(dst, xtmp1, dst, vec_enc);
5743 }
5744 
5745 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5746                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5747   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5748   // Add the popcount of upper and lower bytes of word.
5749   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5750   vpsrlw(dst, xtmp1, 8, vec_enc);
5751   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5752   vpaddw(dst, dst, xtmp1, vec_enc);
5753 }
5754 
5755 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5756                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5757   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5758   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5759   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5760 }
5761 
5762 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5763                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5764   switch(bt) {
5765     case T_LONG:
5766       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5767       break;
5768     case T_INT:
5769       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5770       break;
5771     case T_CHAR:
5772     case T_SHORT:
5773       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5774       break;
5775     case T_BYTE:
5776     case T_BOOLEAN:
5777       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5778       break;
5779     default:
5780       fatal("Unsupported type %s", type2name(bt));
5781       break;
5782   }
5783 }
5784 
5785 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5786                                                       KRegister mask, bool merge, int vec_enc) {
5787   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5788   switch(bt) {
5789     case T_LONG:
5790       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5791       evpopcntq(dst, mask, src, merge, vec_enc);
5792       break;
5793     case T_INT:
5794       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5795       evpopcntd(dst, mask, src, merge, vec_enc);
5796       break;
5797     case T_CHAR:
5798     case T_SHORT:
5799       assert(VM_Version::supports_avx512_bitalg(), "");
5800       evpopcntw(dst, mask, src, merge, vec_enc);
5801       break;
5802     case T_BYTE:
5803     case T_BOOLEAN:
5804       assert(VM_Version::supports_avx512_bitalg(), "");
5805       evpopcntb(dst, mask, src, merge, vec_enc);
5806       break;
5807     default:
5808       fatal("Unsupported type %s", type2name(bt));
5809       break;
5810   }
5811 }
5812 
5813 #ifndef _LP64
5814 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5815   assert(VM_Version::supports_avx512bw(), "");
5816   kmovdl(tmp, src);
5817   kunpckdql(dst, tmp, tmp);
5818 }
5819 #endif
5820 
5821 // Bit reversal algorithm first reverses the bits of each byte followed by
5822 // a byte level reversal for multi-byte primitive types (short/int/long).
5823 // Algorithm performs a lookup table access to get reverse bit sequence
5824 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5825 // is obtained by swapping the reverse bit sequences of upper and lower
5826 // nibble of a byte.
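     // Worked byte example (illustrative): for the byte 0xB4 (0b10110100) the lower
     // nibble 0x4 reverses to 0x2 and the upper nibble 0xB reverses to 0xD; swapping
     // the two reversed nibbles gives 0x2D (0b00101101), the bit reverse of 0xB4.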
5827 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5828                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5829   if (VM_Version::supports_avx512vlbw()) {
5830 
5831     // Get the reverse bit sequence of lower nibble of each byte.
5832     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5833     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5834     evpandq(dst, xtmp2, src, vec_enc);
5835     vpshufb(dst, xtmp1, dst, vec_enc);
5836     vpsllq(dst, dst, 4, vec_enc);
5837 
5838     // Get the reverse bit sequence of upper nibble of each byte.
5839     vpandn(xtmp2, xtmp2, src, vec_enc);
5840     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5841     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5842 
5843     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5844     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5845     evporq(xtmp2, dst, xtmp2, vec_enc);
5846     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5847 
5848   } else if (vec_enc == Assembler::AVX_512bit) {
5849     // Shift based bit reversal.
5850     assert(bt == T_LONG || bt == T_INT, "");
5851 
5852     // Swap lower and upper nibble of each byte.
5853     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5854 
5855     // Swap two least and most significant bits of each nibble.
5856     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5857 
5858     // Swap adjacent pair of bits.
5859     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5860     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5861 
5862     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5863     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5864   } else {
5865     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5866     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5867 
5868     // Get the reverse bit sequence of lower nibble of each byte.
5869     vpand(dst, xtmp2, src, vec_enc);
5870     vpshufb(dst, xtmp1, dst, vec_enc);
5871     vpsllq(dst, dst, 4, vec_enc);
5872 
5873     // Get the reverse bit sequence of upper nibble of each byte.
5874     vpandn(xtmp2, xtmp2, src, vec_enc);
5875     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5876     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5877 
5878     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5879     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5880     vpor(xtmp2, dst, xtmp2, vec_enc);
5881     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5882   }
5883 }
5884 
5885 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5886                                                 XMMRegister xtmp, Register rscratch) {
5887   assert(VM_Version::supports_gfni(), "");
5888   assert(rscratch != noreg || always_reachable(mask), "missing");
5889 
5890   // Galois field instruction based bit reversal, as per the following algorithm.
5891   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5892   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5893   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5894   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5895 }
5896 
5897 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5898                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5899   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5900   evpandq(dst, xtmp1, src, vec_enc);
5901   vpsllq(dst, dst, nbits, vec_enc);
5902   vpandn(xtmp1, xtmp1, src, vec_enc);
5903   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5904   evporq(dst, dst, xtmp1, vec_enc);
5905 }
5906 
5907 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5908                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5909   // Shift based bit reversal.
5910   assert(VM_Version::supports_evex(), "");
5911   switch(bt) {
5912     case T_LONG:
5913       // Swap upper and lower double word of each quad word.
5914       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5915       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5916       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5917       break;
5918     case T_INT:
5919       // Swap upper and lower word of each double word.
5920       evprord(xtmp1, k0, src, 16, true, vec_enc);
5921       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5922       break;
5923     case T_CHAR:
5924     case T_SHORT:
5925       // Swap upper and lower byte of each word.
5926       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5927       break;
5928     case T_BYTE:
5929       evmovdquq(dst, k0, src, true, vec_enc);
5930       break;
5931     default:
5932       fatal("Unsupported type %s", type2name(bt));
5933       break;
5934   }
5935 }
5936 
5937 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5938   if (bt == T_BYTE) {
5939     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5940       evmovdquq(dst, k0, src, true, vec_enc);
5941     } else {
5942       vmovdqu(dst, src);
5943     }
5944     return;
5945   }
5946   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5947   // pre-computed shuffle indices.
5948   switch(bt) {
5949     case T_LONG:
5950       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5951       break;
5952     case T_INT:
5953       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5954       break;
5955     case T_CHAR:
5956     case T_SHORT:
5957       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5958       break;
5959     default:
5960       fatal("Unsupported type %s", type2name(bt));
5961       break;
5962   }
5963   vpshufb(dst, src, dst, vec_enc);
5964 }
5965 
5966 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5967                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5968                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5969   assert(is_integral_type(bt), "");
5970   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5971   assert(VM_Version::supports_avx512cd(), "");
5972   switch(bt) {
5973     case T_LONG:
5974       evplzcntq(dst, ktmp, src, merge, vec_enc);
5975       break;
5976     case T_INT:
5977       evplzcntd(dst, ktmp, src, merge, vec_enc);
5978       break;
5979     case T_SHORT:
5980       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5981       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5982       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5983       vpunpckhwd(dst, xtmp1, src, vec_enc);
5984       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5985       vpackusdw(dst, xtmp2, dst, vec_enc);
5986       break;
5987     case T_BYTE:
5988       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5989       // accessing the lookup table.
5990       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5991       // accessing the lookup table.
5992       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
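           // Worked example (illustrative, assuming the LUT maps a nibble to its
           // leading zero count within 4 bits): for the byte 0x0A the MSB nibble is 0,
           // so T1 (= 0) is added to T2 (= 4) giving clz = 4; for the byte 0x1A the
           // MSB nibble is 0x1, so the result is just T2 = 3.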
5993       assert(VM_Version::supports_avx512bw(), "");
5994       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5995       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5996       vpand(xtmp2, dst, src, vec_enc);
5997       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5998       vpsrlw(xtmp3, src, 4, vec_enc);
5999       vpand(xtmp3, dst, xtmp3, vec_enc);
6000       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6001       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6002       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6003       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6004       break;
6005     default:
6006       fatal("Unsupported type %s", type2name(bt));
6007       break;
6008   }
6009 }
6010 
6011 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6012                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6013   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6014   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6015   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6016   // accessing the lookup table.
6017   vpand(dst, xtmp2, src, vec_enc);
6018   vpshufb(dst, xtmp1, dst, vec_enc);
6019   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6020   // accessing the lookup table.
6021   vpsrlw(xtmp3, src, 4, vec_enc);
6022   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6023   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6024   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6025   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6026   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6027   vpaddb(dst, dst, xtmp2, vec_enc);
6028   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6029 }
6030 
6031 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6032                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6033   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6034   // Add zero counts of lower byte and upper byte of a word if
6035   // upper byte holds a zero value.
6036   vpsrlw(xtmp3, src, 8, vec_enc);
6037   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6038   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6039   vpsllw(xtmp2, dst, 8, vec_enc);
6040   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6041   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6042   vpsrlw(dst, dst, 8, vec_enc);
6043 }
6044 
6045 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6046                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6047   // Since the IEEE 754 floating point format represents the mantissa in a
6048   // normalized 1.m form, the biased exponent can be used to compute the leading
6049   // zero count as per the following formula:
6050   // LZCNT = 31 - (biased_exp - 127)
6051   // Special handling has been introduced for Zero, Max_Int and -ve source values.
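       // Worked example (illustrative): src = 8 converts to 8.0f with biased exponent
       // 130, so LZCNT = 31 - (130 - 127) = 28, matching the leading zero count of 8.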
6052 
6053   // Broadcast 0xFF
6054   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6055   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6056 
6057   // Extract biased exponent.
6058   vcvtdq2ps(dst, src, vec_enc);
6059   vpsrld(dst, dst, 23, vec_enc);
6060   vpand(dst, dst, xtmp1, vec_enc);
6061 
6062   // Broadcast 127.
6063   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6064   // Exponent = biased_exp - 127
6065   vpsubd(dst, dst, xtmp1, vec_enc);
6066 
6067   // Exponent = Exponent  + 1
6068   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6069   vpaddd(dst, dst, xtmp3, vec_enc);
6070 
6071   // Replace -ve exponent with zero, exponent is -ve when src
6072   // lane contains a zero value.
6073   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6074   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6075 
6076   // Rematerialize broadcast 32.
6077   vpslld(xtmp1, xtmp3, 5, vec_enc);
6078   // Exponent is 32 if corresponding source lane contains max_int value.
6079   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6080   // LZCNT = 32 - exponent
6081   vpsubd(dst, xtmp1, dst, vec_enc);
6082 
6083   // Replace LZCNT with a value 1 if corresponding source lane
6084   // contains max_int value.
6085   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6086 
6087   // Replace the leading zero count with 0 if the source lane value is negative.
6088   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6089   vblendvps(dst, dst, xtmp2, src, vec_enc);
6090 }
6091 
6092 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6093                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6094   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6095   // Add zero counts of lower word and upper word of a double word if
6096   // upper word holds a zero value.
6097   vpsrld(xtmp3, src, 16, vec_enc);
6098   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6099   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6100   vpslld(xtmp2, dst, 16, vec_enc);
6101   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6102   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6103   vpsrld(dst, dst, 16, vec_enc);
6104   // Add zero counts of lower doubleword and upper doubleword of a
6105   // quadword if upper doubleword holds a zero value.
6106   vpsrlq(xtmp3, src, 32, vec_enc);
6107   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6108   vpsllq(xtmp2, dst, 32, vec_enc);
6109   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6110   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6111   vpsrlq(dst, dst, 32, vec_enc);
6112 }
6113 
6114 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6115                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6116                                                        Register rtmp, int vec_enc) {
6117   assert(is_integral_type(bt), "unexpected type");
6118   assert(vec_enc < Assembler::AVX_512bit, "");
6119   switch(bt) {
6120     case T_LONG:
6121       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6122       break;
6123     case T_INT:
6124       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6125       break;
6126     case T_SHORT:
6127       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6128       break;
6129     case T_BYTE:
6130       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6131       break;
6132     default:
6133       fatal("Unsupported type %s", type2name(bt));
6134       break;
6135   }
6136 }
6137 
6138 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6139   switch(bt) {
6140     case T_BYTE:
6141       vpsubb(dst, src1, src2, vec_enc);
6142       break;
6143     case T_SHORT:
6144       vpsubw(dst, src1, src2, vec_enc);
6145       break;
6146     case T_INT:
6147       vpsubd(dst, src1, src2, vec_enc);
6148       break;
6149     case T_LONG:
6150       vpsubq(dst, src1, src2, vec_enc);
6151       break;
6152     default:
6153       fatal("Unsupported type %s", type2name(bt));
6154       break;
6155   }
6156 }
6157 
6158 // Trailing zero count computation is based on the leading zero count operation as per the
6159 // following equation. All AVX3 targets support the AVX512CD feature which offers a
6160 // direct vector instruction to compute the leading zero count.
6161 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
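     //      e.g. (illustrative) for a 32 bit lane with x = 0b1100: (x - 1) & ~x = 0b0011,
     //      whose CLZ is 30, so CTZ = 32 - 30 = 2.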
6162 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6163                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6164                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6165   assert(is_integral_type(bt), "");
6166   // xtmp = -1
6167   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6168   // xtmp = xtmp + src
6169   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6170   // xtmp = xtmp & ~src
6171   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6172   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6173   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6174   vpsub(bt, dst, xtmp4, dst, vec_enc);
6175 }
6176 
6177 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
6178 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
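     //      e.g. (illustrative) for a 32 bit lane with x = 0b1100: x | -x has the two
     //      lowest bits clear and the remaining 30 bits set, so CTZ = 32 - 30 = 2.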
6179 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6180                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6181   assert(is_integral_type(bt), "");
6182   // xtmp = 0
6183   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6184   // xtmp = 0 - src
6185   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6186   // xtmp = xtmp | src
6187   vpor(xtmp3, xtmp3, src, vec_enc);
6188   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6189   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6190   vpsub(bt, dst, xtmp1, dst, vec_enc);
6191 }
6192 
6193 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6194   Label done;
6195   Label neg_divisor_fastpath;
6196   cmpl(divisor, 0);
6197   jccb(Assembler::less, neg_divisor_fastpath);
6198   xorl(rdx, rdx);
6199   divl(divisor);
6200   jmpb(done);
6201   bind(neg_divisor_fastpath);
6202   // Fastpath for divisor < 0:
6203   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6204   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
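       // A negative divisor is >= 2^31 when viewed as unsigned, so the unsigned quotient
       // can only be 0 or 1; it is 1 exactly when dividend >= divisor (unsigned), which is
       // what the masked sign-bit expression above evaluates. E.g. (illustrative)
       // dividend = 0xF0000000, divisor = 0x80000000: dividend - divisor = 0x70000000,
       // dividend & ~0x70000000 = 0x80000000, and the unsigned shift by 31 yields 1.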
6205   movl(rdx, rax);
6206   subl(rdx, divisor);
6207   if (VM_Version::supports_bmi1()) {
6208     andnl(rax, rdx, rax);
6209   } else {
6210     notl(rdx);
6211     andl(rax, rdx);
6212   }
6213   shrl(rax, 31);
6214   bind(done);
6215 }
6216 
6217 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6218   Label done;
6219   Label neg_divisor_fastpath;
6220   cmpl(divisor, 0);
6221   jccb(Assembler::less, neg_divisor_fastpath);
6222   xorl(rdx, rdx);
6223   divl(divisor);
6224   jmpb(done);
6225   bind(neg_divisor_fastpath);
6226   // Fastpath when divisor < 0:
6227   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6228   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
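       // With a negative divisor the unsigned quotient is 0 or 1 (see udivI above), so
       // the remainder is the dividend minus either 0 or the divisor; the arithmetic
       // shift by 31 turns that quotient bit into a 0/-1 mask used to select between them.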
6229   movl(rdx, rax);
6230   subl(rax, divisor);
6231   if (VM_Version::supports_bmi1()) {
6232     andnl(rax, rax, rdx);
6233   } else {
6234     notl(rax);
6235     andl(rax, rdx);
6236   }
6237   sarl(rax, 31);
6238   andl(rax, divisor);
6239   subl(rdx, rax);
6240   bind(done);
6241 }
6242 
6243 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6244   Label done;
6245   Label neg_divisor_fastpath;
6246 
6247   cmpl(divisor, 0);
6248   jccb(Assembler::less, neg_divisor_fastpath);
6249   xorl(rdx, rdx);
6250   divl(divisor);
6251   jmpb(done);
6252   bind(neg_divisor_fastpath);
6253   // Fastpath for divisor < 0:
6254   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6255   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6256   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6257   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6258   movl(rdx, rax);
6259   subl(rax, divisor);
6260   if (VM_Version::supports_bmi1()) {
6261     andnl(rax, rax, rdx);
6262   } else {
6263     notl(rax);
6264     andl(rax, rdx);
6265   }
6266   movl(tmp, rax);
6267   shrl(rax, 31); // quotient
6268   sarl(tmp, 31);
6269   andl(tmp, divisor);
6270   subl(rdx, tmp); // remainder
6271   bind(done);
6272 }
6273 
6274 #ifdef _LP64
6275 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6276                                  XMMRegister xtmp2, Register rtmp) {
6277   if (VM_Version::supports_gfni()) {
6278     // Galois field instruction based bit reversal, as per the following algorithm.
6279     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6280     mov64(rtmp, 0x8040201008040201L);
6281     movq(xtmp1, src);
6282     movq(xtmp2, rtmp);
6283     gf2p8affineqb(xtmp1, xtmp2, 0);
6284     movq(dst, xtmp1);
6285   } else {
6286     // Swap even and odd numbered bits.
6287     movl(rtmp, src);
6288     andl(rtmp, 0x55555555);
6289     shll(rtmp, 1);
6290     movl(dst, src);
6291     andl(dst, 0xAAAAAAAA);
6292     shrl(dst, 1);
6293     orl(dst, rtmp);
6294 
6295     // Swap LSB and MSB 2 bits of each nibble.
6296     movl(rtmp, dst);
6297     andl(rtmp, 0x33333333);
6298     shll(rtmp, 2);
6299     andl(dst, 0xCCCCCCCC);
6300     shrl(dst, 2);
6301     orl(dst, rtmp);
6302 
6303     // Swap LSB and MSB 4 bits of each byte.
6304     movl(rtmp, dst);
6305     andl(rtmp, 0x0F0F0F0F);
6306     shll(rtmp, 4);
6307     andl(dst, 0xF0F0F0F0);
6308     shrl(dst, 4);
6309     orl(dst, rtmp);
6310   }
6311   bswapl(dst);
6312 }
6313 
6314 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6315                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6316   if (VM_Version::supports_gfni()) {
6317     // Galois field instruction based bit reversal, as per the following algorithm.
6318     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6319     mov64(rtmp1, 0x8040201008040201L);
6320     movq(xtmp1, src);
6321     movq(xtmp2, rtmp1);
6322     gf2p8affineqb(xtmp1, xtmp2, 0);
6323     movq(dst, xtmp1);
6324   } else {
6325     // Swap even and odd numbered bits.
6326     movq(rtmp1, src);
6327     mov64(rtmp2, 0x5555555555555555L);
6328     andq(rtmp1, rtmp2);
6329     shlq(rtmp1, 1);
6330     movq(dst, src);
6331     notq(rtmp2);
6332     andq(dst, rtmp2);
6333     shrq(dst, 1);
6334     orq(dst, rtmp1);
6335 
6336     // Swap LSB and MSB 2 bits of each nibble.
6337     movq(rtmp1, dst);
6338     mov64(rtmp2, 0x3333333333333333L);
6339     andq(rtmp1, rtmp2);
6340     shlq(rtmp1, 2);
6341     notq(rtmp2);
6342     andq(dst, rtmp2);
6343     shrq(dst, 2);
6344     orq(dst, rtmp1);
6345 
6346     // Swap LSB and MSB 4 bits of each byte.
6347     movq(rtmp1, dst);
6348     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6349     andq(rtmp1, rtmp2);
6350     shlq(rtmp1, 4);
6351     notq(rtmp2);
6352     andq(dst, rtmp2);
6353     shrq(dst, 4);
6354     orq(dst, rtmp1);
6355   }
6356   bswapq(dst);
6357 }
6358 
6359 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6360   Label done;
6361   Label neg_divisor_fastpath;
6362   cmpq(divisor, 0);
6363   jccb(Assembler::less, neg_divisor_fastpath);
6364   xorl(rdx, rdx);
6365   divq(divisor);
6366   jmpb(done);
6367   bind(neg_divisor_fastpath);
6368   // Fastpath for divisor < 0:
6369   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6370   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6371   movq(rdx, rax);
6372   subq(rdx, divisor);
6373   if (VM_Version::supports_bmi1()) {
6374     andnq(rax, rdx, rax);
6375   } else {
6376     notq(rdx);
6377     andq(rax, rdx);
6378   }
6379   shrq(rax, 63);
6380   bind(done);
6381 }
6382 
6383 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6384   Label done;
6385   Label neg_divisor_fastpath;
6386   cmpq(divisor, 0);
6387   jccb(Assembler::less, neg_divisor_fastpath);
6388   xorq(rdx, rdx);
6389   divq(divisor);
6390   jmp(done);
6391   bind(neg_divisor_fastpath);
6392   // Fastpath when divisor < 0:
6393   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6394   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6395   movq(rdx, rax);
6396   subq(rax, divisor);
6397   if (VM_Version::supports_bmi1()) {
6398     andnq(rax, rax, rdx);
6399   } else {
6400     notq(rax);
6401     andq(rax, rdx);
6402   }
6403   sarq(rax, 63);
6404   andq(rax, divisor);
6405   subq(rdx, rax);
6406   bind(done);
6407 }
6408 
6409 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6410   Label done;
6411   Label neg_divisor_fastpath;
6412   cmpq(divisor, 0);
6413   jccb(Assembler::less, neg_divisor_fastpath);
6414   xorq(rdx, rdx);
6415   divq(divisor);
6416   jmp(done);
6417   bind(neg_divisor_fastpath);
6418   // Fastpath for divisor < 0:
6419   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6420   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6421   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6422   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6423   movq(rdx, rax);
6424   subq(rax, divisor);
6425   if (VM_Version::supports_bmi1()) {
6426     andnq(rax, rax, rdx);
6427   } else {
6428     notq(rax);
6429     andq(rax, rdx);
6430   }
6431   movq(tmp, rax);
6432   shrq(rax, 63); // quotient
6433   sarq(tmp, 63);
6434   andq(tmp, divisor);
6435   subq(rdx, tmp); // remainder
6436   bind(done);
6437 }
6438 #endif
6439 
6440 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6441                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6442                                         int vlen_enc) {
6443   assert(VM_Version::supports_avx512bw(), "");
6444   // Byte shuffles are in-lane operations and indices are determined using the
6445   // lower 4 bits of each shuffle lane, thus all shuffle indices are
6446   // normalized to the index range 0-15. This makes sure that indices which are
6447   // a multiple of 16 apart map to the same relative position within a 128 bit
6448   // lane, i.e. the elements selected by shuffle indices 16, 32 and 48 are each
6449   // the first element of their respective 128 bit source lanes.
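       // E.g. (illustrative) shuffle index 37 (0x25): its lower 4 bits select byte 5
       // within a 128 bit lane, and since 32 <= 37 < 48 the step below that broadcasts
       // the third 128 bit source lane supplies the data, so byte 37 of src is selected.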
6450   movl(rtmp, 16);
6451   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6452 
6453   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6454   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6455   // original shuffle indices and move the shuffled lanes corresponding to true
6456   // mask to destination vector.
6457   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6458   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6459   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6460 
6461   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6462   // and broadcasting second 128 bit lane.
6463   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6464   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6465   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6466   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6467   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6468 
6469   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6470   // and broadcasting third 128 bit lane.
6471   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6472   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6473   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6474   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6475   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6476 
6477   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6478   // and broadcasting fourth 128 bit lane.
6479   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6480   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6481   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6482   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6483   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6484 }
6485 
6486 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6487                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6488   if (vlen_enc == AVX_128bit) {
6489     vpermilps(dst, src, shuffle, vlen_enc);
6490   } else if (bt == T_INT) {
6491     vpermd(dst, shuffle, src, vlen_enc);
6492   } else {
6493     assert(bt == T_FLOAT, "");
6494     vpermps(dst, shuffle, src, vlen_enc);
6495   }
6496 }