1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  // WARNING: The initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push used to verify stack depth is ok at 5 bytes,
  // but the frame allocation can be either 3 or 6 bytes. So if we don't
  // do a stack bang then we must use the 6-byte frame allocation even if
  // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang the stack for them.  Be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack, but the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
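    // Using a 32-bit immediate keeps this first instruction at least 5 bytes long,
    // so patch_verified_entry (see the WARNING above) remains safe even for tiny frames.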
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
  if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
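// Map a vector length in bytes to the AVX vector-length encoding used by the assembler.
// 4- and 8-byte vectors have no narrower encoding and therefore use the 128-bit (XMM) encoding.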
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 #if INCLUDE_RTM_OPT
 176 
 177 // Update rtm_counters based on abort status
 178 // input: abort_status
 179 //        rtm_counters (RTMLockingCounters*)
 180 // flags are killed
 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 182 
 183   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 184   if (PrintPreciseRTMLockingStatistics) {
 185     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 186       Label check_abort;
 187       testl(abort_status, (1<<i));
 188       jccb(Assembler::equal, check_abort);
 189       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 190       bind(check_abort);
 191     }
 192   }
 193 }
 194 
// Branch if ((random & (count-1)) != 0); count must be a power of two (2^n)
 196 // tmp, scr and flags are killed
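// The low 32 bits of the time-stamp counter (left in rax by rdtsc) act as a cheap
// pseudo-random value, so the fall-through path (and hence the counter update in the
// caller) is taken roughly once every 'count' calls.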
 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 198   assert(tmp == rax, "");
 199   assert(scr == rdx, "");
 200   rdtsc(); // modifies EDX:EAX
 201   andptr(tmp, count-1);
 202   jccb(Assembler::notZero, brLabel);
 203 }
 204 
 205 // Perform abort ratio calculation, set no_rtm bit if high ratio
 206 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 207 // tmpReg, rtm_counters_Reg and flags are killed
 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 209                                                     Register rtm_counters_Reg,
 210                                                     RTMLockingCounters* rtm_counters,
 211                                                     Metadata* method_data) {
 212   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 213 
 214   if (RTMLockingCalculationDelay > 0) {
 215     // Delay calculation
 216     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 217     testptr(tmpReg, tmpReg);
 218     jccb(Assembler::equal, L_done);
 219   }
 220   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 221   //   Aborted transactions = abort_count * 100
 222   //   All transactions = total_count *  RTMTotalCountIncrRate
 223   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
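  // Equivalently: the no_rtm bit is set once abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio,
  // i.e. once aborts make up at least RTMAbortRatio percent of all started transactions.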
 224 
 225   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 226   cmpptr(tmpReg, RTMAbortThreshold);
 227   jccb(Assembler::below, L_check_always_rtm2);
 228   imulptr(tmpReg, tmpReg, 100);
 229 
 230   Register scrReg = rtm_counters_Reg;
 231   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 232   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 233   imulptr(scrReg, scrReg, RTMAbortRatio);
 234   cmpptr(tmpReg, scrReg);
 235   jccb(Assembler::below, L_check_always_rtm1);
 236   if (method_data != nullptr) {
 237     // set rtm_state to "no rtm" in MDO
 238     mov_metadata(tmpReg, method_data);
 239     lock();
 240     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 241   }
 242   jmpb(L_done);
 243   bind(L_check_always_rtm1);
 244   // Reload RTMLockingCounters* address
 245   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 246   bind(L_check_always_rtm2);
 247   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 248   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 249   jccb(Assembler::below, L_done);
 250   if (method_data != nullptr) {
 251     // set rtm_state to "always rtm" in MDO
 252     mov_metadata(tmpReg, method_data);
 253     lock();
 254     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 255   }
 256   bind(L_done);
 257 }
 258 
 259 // Update counters and perform abort ratio calculation
 260 // input:  abort_status_Reg
 261 // rtm_counters_Reg, flags are killed
 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 263                                       Register rtm_counters_Reg,
 264                                       RTMLockingCounters* rtm_counters,
 265                                       Metadata* method_data,
 266                                       bool profile_rtm) {
 267 
 268   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 269   // update rtm counters based on rax value at abort
 270   // reads abort_status_Reg, updates flags
 271   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 272   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 273   if (profile_rtm) {
 274     // Save abort status because abort_status_Reg is used by following code.
 275     if (RTMRetryCount > 0) {
 276       push(abort_status_Reg);
 277     }
 278     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 279     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 280     // restore abort status
 281     if (RTMRetryCount > 0) {
 282       pop(abort_status_Reg);
 283     }
 284   }
 285 }
 286 
 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 288 // inputs: retry_count_Reg
 289 //       : abort_status_Reg
 290 // output: retry_count_Reg decremented by 1
 291 // flags are killed
 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 293   Label doneRetry;
 294   assert(abort_status_Reg == rax, "");
 295   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 296   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 297   // if reason is in 0x6 and retry count != 0 then retry
 298   andptr(abort_status_Reg, 0x6);
 299   jccb(Assembler::zero, doneRetry);
 300   testl(retry_count_Reg, retry_count_Reg);
 301   jccb(Assembler::zero, doneRetry);
 302   pause();
 303   decrementl(retry_count_Reg);
 304   jmp(retryLabel);
 305   bind(doneRetry);
 306 }
 307 
// Spin and retry if the lock is busy.
 309 // inputs: box_Reg (monitor address)
 310 //       : retry_count_Reg
 311 // output: retry_count_Reg decremented by 1
 312 //       : clear z flag if retry count exceeded
 313 // tmp_Reg, scr_Reg, flags are killed
 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 315                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 316   Label SpinLoop, SpinExit, doneRetry;
 317   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 318 
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   decrementl(retry_count_Reg);
 322   movptr(scr_Reg, RTMSpinLoopCount);
 323 
 324   bind(SpinLoop);
 325   pause();
 326   decrementl(scr_Reg);
 327   jccb(Assembler::lessEqual, SpinExit);
 328   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 329   testptr(tmp_Reg, tmp_Reg);
 330   jccb(Assembler::notZero, SpinLoop);
 331 
 332   bind(SpinExit);
 333   jmp(retryLabel);
 334   bind(doneRetry);
 335   incrementl(retry_count_Reg); // clear z flag
 336 }
 337 
 338 // Use RTM for normal stack locks
 339 // Input: objReg (object to lock)
 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 341                                          Register retry_on_abort_count_Reg,
 342                                          RTMLockingCounters* stack_rtm_counters,
 343                                          Metadata* method_data, bool profile_rtm,
 344                                          Label& DONE_LABEL, Label& IsInflated) {
 345   assert(UseRTMForStackLocks, "why call this otherwise?");
 346   assert(tmpReg == rax, "");
 347   assert(scrReg == rdx, "");
 348   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 349 
 350   if (RTMRetryCount > 0) {
 351     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 352     bind(L_rtm_retry);
 353   }
 354   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 355   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 356   jcc(Assembler::notZero, IsInflated);
 357 
 358   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 359     Label L_noincrement;
 360     if (RTMTotalCountIncrRate > 1) {
 361       // tmpReg, scrReg and flags are killed
 362       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 363     }
 364     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 365     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 366     bind(L_noincrement);
 367   }
 368   xbegin(L_on_abort);
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 370   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 371   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 372   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 373 
 374   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 375   if (UseRTMXendForLockBusy) {
 376     xend();
 377     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 378     jmp(L_decrement_retry);
 379   }
 380   else {
 381     xabort(0);
 382   }
 383   bind(L_on_abort);
 384   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 385     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 386   }
 387   bind(L_decrement_retry);
 388   if (RTMRetryCount > 0) {
 389     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 390     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 391   }
 392 }
 393 
// Use RTM for inflated locks
 395 // inputs: objReg (object to lock)
 396 //         boxReg (on-stack box address (displaced header location) - KILLED)
 397 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 399                                             Register scrReg, Register retry_on_busy_count_Reg,
 400                                             Register retry_on_abort_count_Reg,
 401                                             RTMLockingCounters* rtm_counters,
 402                                             Metadata* method_data, bool profile_rtm,
 403                                             Label& DONE_LABEL) {
 404   assert(UseRTMLocking, "why call this otherwise?");
 405   assert(tmpReg == rax, "");
 406   assert(scrReg == rdx, "");
 407   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 408   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 409 
 410   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 411   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 412 
 413   if (RTMRetryCount > 0) {
 414     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 415     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 416     bind(L_rtm_retry);
 417   }
 418   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 419     Label L_noincrement;
 420     if (RTMTotalCountIncrRate > 1) {
 421       // tmpReg, scrReg and flags are killed
 422       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 423     }
 424     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 425     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 426     bind(L_noincrement);
 427   }
 428   xbegin(L_on_abort);
 429   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 430   movptr(tmpReg, Address(tmpReg, owner_offset));
 431   testptr(tmpReg, tmpReg);
 432   jcc(Assembler::zero, DONE_LABEL);
 433   if (UseRTMXendForLockBusy) {
 434     xend();
 435     jmp(L_decrement_retry);
 436   }
 437   else {
 438     xabort(0);
 439   }
 440   bind(L_on_abort);
 441   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 442   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 443     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 444   }
 445   if (RTMRetryCount > 0) {
 446     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 447     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 448   }
 449 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 453 
 454   // Appears unlocked - try to swing _owner from null to non-null.
 455   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 456 #ifdef _LP64
 457   Register threadReg = r15_thread;
 458 #else
 459   get_thread(scrReg);
 460   Register threadReg = scrReg;
 461 #endif
 462   lock();
 463   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 464 
 465   if (RTMRetryCount > 0) {
    // if the CAS succeeded we are done, otherwise retry
    jccb(Assembler::equal, DONE_LABEL);
 468     bind(L_decrement_retry);
 469     // Spin and retry if lock is busy.
 470     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 471   }
 472   else {
 473     bind(L_decrement_retry);
 474   }
 475 }
 476 
 477 #endif //  INCLUDE_RTM_OPT
 478 
 479 // fast_lock and fast_unlock used by C2
 480 
 481 // Because the transitions from emitted code to the runtime
 482 // monitorenter/exit helper stubs are so slow it's critical that
 483 // we inline both the stack-locking fast path and the inflated fast path.
 484 //
 485 // See also: cmpFastLock and cmpFastUnlock.
 486 //
 487 // What follows is a specialized inline transliteration of the code
 488 // in enter() and exit(). If we're concerned about I$ bloat another
 489 // option would be to emit TrySlowEnter and TrySlowExit methods
 490 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 492 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 493 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 494 // In practice, however, the # of lock sites is bounded and is usually small.
 495 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 499 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 501 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 502 // to those specialized methods.  That'd give us a mostly platform-independent
 503 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 508 //
 509 // TODO:
 510 //
 511 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 512 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 513 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 514 //    the lock operators would typically be faster than reifying Self.
 515 //
 516 // *  Ideally I'd define the primitives as:
 517 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 518 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 519 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 521 //    Furthermore the register assignments are overconstrained, possibly resulting in
 522 //    sub-optimal code near the synchronization site.
 523 //
 524 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 525 //    Alternately, use a better sp-proximity test.
 526 //
 527 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 528 //    Either one is sufficient to uniquely identify a thread.
 529 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 530 //
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 534 //
 535 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 536 //    But beware of excessive branch density on AMD Opterons.
 537 //
 538 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 539 //    or failure of the fast path.  If the fast path fails then we pass
 540 //    control to the slow path, typically in C.  In fast_lock and
 541 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 542 //    will emit a conditional branch immediately after the node.
 543 //    So we have branches to branches and lots of ICC.ZF games.
 544 //    Instead, it might be better to have C2 pass a "FailureLabel"
 545 //    into fast_lock and fast_unlock.  In the case of success, control
 546 //    will drop through the node.  ICC.ZF is undefined at exit.
 547 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 549 
 550 
 551 // obj: object to lock
 552 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 554 // scr: tmp -- KILLED
 555 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 556                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 557                                  RTMLockingCounters* rtm_counters,
 558                                  RTMLockingCounters* stack_rtm_counters,
 559                                  Metadata* method_data,
 560                                  bool use_rtm, bool profile_rtm) {
 561   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 562   // Ensure the register assignments are disjoint
 563   assert(tmpReg == rax, "");
 564 
 565   if (use_rtm) {
 566     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 567   } else {
 568     assert(cx1Reg == noreg, "");
 569     assert(cx2Reg == noreg, "");
 570     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 571   }
 572 
 573   // Possible cases that we'll encounter in fast_lock
 574   // ------------------------------------------------
 575   // * Inflated
 576   //    -- unlocked
 577   //    -- Locked
 578   //       = by self
 579   //       = by other
 580   // * neutral
 581   // * stack-locked
 582   //    -- by self
 583   //       = sp-proximity test hits
 584   //       = sp-proximity test generates false-negative
 585   //    -- by other
 586   //
 587 
 588   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 589 
 590   if (DiagnoseSyncOnValueBasedClasses != 0) {
 591     load_klass(tmpReg, objReg, scrReg);
 592     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 593     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 594     jcc(Assembler::notZero, DONE_LABEL);
 595   }
 596 
 597 #if INCLUDE_RTM_OPT
 598   if (UseRTMForStackLocks && use_rtm) {
 599     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 600     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 601                       stack_rtm_counters, method_data, profile_rtm,
 602                       DONE_LABEL, IsInflated);
 603   }
 604 #endif // INCLUDE_RTM_OPT
 605 
 606   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 607   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 608   jcc(Assembler::notZero, IsInflated);
 609 
 610   if (LockingMode == LM_MONITOR) {
 611     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 612     testptr(objReg, objReg);
 613   } else {
 614     assert(LockingMode == LM_LEGACY, "must be");
 615     // Attempt stack-locking ...
 616     orptr (tmpReg, markWord::unlocked_value);
 617     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 618     lock();
 619     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
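    // The CAS compares rax (tmpReg, the anticipated unlocked mark stored into the box above)
    // with the object's mark word and, on success, installs the box address as the new stack-locked mark.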
 620     jcc(Assembler::equal, COUNT);           // Success
 621 
 622     // Recursive locking.
 623     // The object is stack-locked: markword contains stack pointer to BasicLock.
 624     // Locked by current thread if difference with current SP is less than one page.
 625     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 627     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
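    // The mask keeps exactly the bits that must be zero for a word-aligned address within
    // one page of rsp: everything at or above the page size plus the low alignment bits.
    // A zero result (ZF == 1) therefore indicates a recursive stack lock by the current thread.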
 628     movptr(Address(boxReg, 0), tmpReg);
 629   }
 630   jmp(DONE_LABEL);
 631 
 632   bind(IsInflated);
 633   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 634 
 635 #if INCLUDE_RTM_OPT
 636   // Use the same RTM locking code in 32- and 64-bit VM.
 637   if (use_rtm) {
 638     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 639                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 640   } else {
 641 #endif // INCLUDE_RTM_OPT
 642 
 643 #ifndef _LP64
 644   // The object is inflated.
 645 
 646   // boxReg refers to the on-stack BasicLock in the current frame.
 647   // We'd like to write:
 648   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 650   // additional latency as we have another ST in the store buffer that must drain.
 651 
 652   // avoid ST-before-CAS
 653   // register juggle because we need tmpReg for cmpxchgptr below
 654   movptr(scrReg, boxReg);
 655   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 656 
 657   // Optimistic form: consider XORL tmpReg,tmpReg
 658   movptr(tmpReg, NULL_WORD);
 659 
 660   // Appears unlocked - try to swing _owner from null to non-null.
 661   // Ideally, I'd manifest "Self" with get_thread and then attempt
 662   // to CAS the register containing Self into m->Owner.
 663   // But we don't have enough registers, so instead we can either try to CAS
 664   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 665   // we later store "Self" into m->Owner.  Transiently storing a stack address
 666   // (rsp or the address of the box) into  m->owner is harmless.
 667   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 668   lock();
 669   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 670   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 671   // If we weren't able to swing _owner from null to the BasicLock
 672   // then take the slow path.
 673   jccb  (Assembler::notZero, NO_COUNT);
 674   // update _owner from BasicLock to thread
 675   get_thread (scrReg);                    // beware: clobbers ICCs
 676   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 677   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 678 
 679   // If the CAS fails we can either retry or pass control to the slow path.
 680   // We use the latter tactic.
 681   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 682   // If the CAS was successful ...
 683   //   Self has acquired the lock
 684   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 685   // Intentional fall-through into DONE_LABEL ...
 686 #else // _LP64
 687   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 688   movq(scrReg, tmpReg);
 689   xorq(tmpReg, tmpReg);
 690   lock();
 691   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 692   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 693   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 694   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 695   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 696   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 697 
 698   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 699   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 700   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 701   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 702 #endif // _LP64
 703 #if INCLUDE_RTM_OPT
 704   } // use_rtm()
 705 #endif
 706   bind(DONE_LABEL);
 707 
 708   // ZFlag == 1 count in fast path
 709   // ZFlag == 0 count in slow path
 710   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 711 
 712   bind(COUNT);
 713   // Count monitors in fast path
 714   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 715 
 716   xorl(tmpReg, tmpReg); // Set ZF == 1
 717 
 718   bind(NO_COUNT);
 719 
 720   // At NO_COUNT the icc ZFlag is set as follows ...
 721   // fast_unlock uses the same protocol.
 722   // ZFlag == 1 -> Success
 723   // ZFlag == 0 -> Failure - force control through the slow path
 724 }
 725 
 726 // obj: object to unlock
 727 // box: box address (displaced header location), killed.  Must be EAX.
 728 // tmp: killed, cannot be obj nor box.
 729 //
 730 // Some commentary on balanced locking:
 731 //
 732 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 733 // Methods that don't have provably balanced locking are forced to run in the
 734 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 735 // The interpreter provides two properties:
 736 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 738 //      interpreter maintains an on-stack list of locks currently held by
 739 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 742 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 744 // B() doesn't have provably balanced locking so it runs in the interpreter.
 745 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 746 // is still locked by A().
 747 //
 748 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 749 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 750 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 751 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec leaves the JNI case undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 755 // A perfectly viable alternative is to elide the owner check except when
 756 // Xcheck:jni is enabled.
 757 
 758 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 759   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 760   assert(boxReg == rax, "");
 761   assert_different_registers(objReg, boxReg, tmpReg);
 762 
 763   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 764 
 765 #if INCLUDE_RTM_OPT
 766   if (UseRTMForStackLocks && use_rtm) {
 767     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 768     Label L_regular_unlock;
 769     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 770     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 771     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 772     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 773     xend();                                                           // otherwise end...
 774     jmp(DONE_LABEL);                                                  // ... and we're done
 775     bind(L_regular_unlock);
 776   }
 777 #endif
 778 
 779   if (LockingMode == LM_LEGACY) {
 780     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 781     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 782   }
 783   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 784   if (LockingMode != LM_MONITOR) {
 785     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 786     jcc(Assembler::zero, Stacked);
 787   }
 788 
 789   // It's inflated.
 790 
 791 #if INCLUDE_RTM_OPT
 792   if (use_rtm) {
 793     Label L_regular_inflated_unlock;
 794     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 795     movptr(boxReg, Address(tmpReg, owner_offset));
 796     testptr(boxReg, boxReg);
 797     jccb(Assembler::notZero, L_regular_inflated_unlock);
 798     xend();
 799     jmp(DONE_LABEL);
 800     bind(L_regular_inflated_unlock);
 801   }
 802 #endif
 803 
 804   // Despite our balanced locking property we still check that m->_owner == Self
 805   // as java routines or native JNI code called by this thread might
 806   // have released the lock.
 807   // Refer to the comments in synchronizer.cpp for how we might encode extra
 808   // state in _succ so we can avoid fetching EntryList|cxq.
 809   //
 810   // If there's no contention try a 1-0 exit.  That is, exit without
 811   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 812   // we detect and recover from the race that the 1-0 exit admits.
 813   //
 814   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 815   // before it STs null into _owner, releasing the lock.  Updates
 816   // to data protected by the critical section must be visible before
 817   // we drop the lock (and thus before any other thread could acquire
 818   // the lock and observe the fields protected by the lock).
 819   // IA32's memory-model is SPO, so STs are ordered with respect to
 820   // each other and there's no need for an explicit barrier (fence).
 821   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 822 #ifndef _LP64
 823   // Note that we could employ various encoding schemes to reduce
 824   // the number of loads below (currently 4) to just 2 or 3.
 825   // Refer to the comments in synchronizer.cpp.
 826   // In practice the chain of fetches doesn't seem to impact performance, however.
 827   xorptr(boxReg, boxReg);
 828   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 829   jccb  (Assembler::notZero, DONE_LABEL);
 830   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 831   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 832   jccb  (Assembler::notZero, DONE_LABEL);
 833   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 834   jmpb  (DONE_LABEL);
 835 #else // _LP64
 836   // It's inflated
 837   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 838 
 839   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 840   jccb(Assembler::equal, LNotRecursive);
 841 
 842   // Recursive inflated unlock
 843   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 844   jmpb(LSuccess);
 845 
 846   bind(LNotRecursive);
 847   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 848   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 849   jccb  (Assembler::notZero, CheckSucc);
 850   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 851   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 852   jmpb  (DONE_LABEL);
 853 
 854   // Try to avoid passing control into the slow_path ...
 855   bind  (CheckSucc);
 856 
 857   // The following optional optimization can be elided if necessary
 858   // Effectively: if (succ == null) goto slow path
 859   // The code reduces the window for a race, however,
 860   // and thus benefits performance.
 861   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 862   jccb  (Assembler::zero, LGoSlowPath);
 863 
 864   xorptr(boxReg, boxReg);
 865   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 866   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 867 
 868   // Memory barrier/fence
 869   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 870   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 871   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 872   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 873   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 874   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 875   lock(); addl(Address(rsp, 0), 0);
 876 
 877   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 878   jccb  (Assembler::notZero, LSuccess);
 879 
 880   // Rare inopportune interleaving - race.
 881   // The successor vanished in the small window above.
 882   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 883   // We need to ensure progress and succession.
 884   // Try to reacquire the lock.
 885   // If that fails then the new owner is responsible for succession and this
 886   // thread needs to take no further action and can exit via the fast path (success).
 887   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 891 
 892   // box is really RAX -- the following CMPXCHG depends on that binding
 893   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 894   lock();
 895   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor, so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock and we're done (and the exit was a success).
 899   jccb  (Assembler::notEqual, LSuccess);
 900   // Intentional fall-through into slow path
 901 
 902   bind  (LGoSlowPath);
 903   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 904   jmpb  (DONE_LABEL);
 905 
 906   bind  (LSuccess);
 907   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 908   jmpb  (DONE_LABEL);
 909 
 910 #endif
 911   if (LockingMode == LM_LEGACY) {
 912     bind  (Stacked);
 913     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 914     lock();
 915     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 916     // Intentional fall-thru into DONE_LABEL
 917   }
 918 
 919   bind(DONE_LABEL);
 920 
 921   // ZFlag == 1 count in fast path
 922   // ZFlag == 0 count in slow path
 923   jccb(Assembler::notZero, NO_COUNT);
 924 
 925   bind(COUNT);
 926   // Count monitors in fast path
 927 #ifndef _LP64
 928   get_thread(tmpReg);
 929   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 930 #else // _LP64
 931   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 932 #endif
 933 
 934   xorl(tmpReg, tmpReg); // Set ZF == 1
 935 
 936   bind(NO_COUNT);
 937 }
 938 
 939 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 940                                               Register t, Register thread) {
 941   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 942   assert(rax_reg == rax, "Used for CAS");
 943   assert_different_registers(obj, box, rax_reg, t, thread);
 944 
 945   // Handle inflated monitor.
 946   Label inflated;
 947   // Finish fast lock successfully. ZF value is irrelevant.
 948   Label locked;
 949   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 950   Label slow_path;
 951 
 952   if (DiagnoseSyncOnValueBasedClasses != 0) {
 953     load_klass(rax_reg, obj, t);
 954     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 955     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 956     jcc(Assembler::notZero, slow_path);
 957   }
 958 
 959   const Register mark = t;
 960 
 961   { // Lightweight Lock
 962 
 963     Label push;
 964 
 965     const Register top = box;
 966 
 967     // Load the mark.
 968     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 969 
 970     // Prefetch top.
 971     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 972 
 973     // Check for monitor (0b10).
 974     testptr(mark, markWord::monitor_value);
 975     jcc(Assembler::notZero, inflated);
 976 
 977     // Check if lock-stack is full.
 978     cmpl(top, LockStack::end_offset() - 1);
 979     jcc(Assembler::greater, slow_path);
 980 
 981     // Check if recursive.
 982     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 983     jccb(Assembler::equal, push);
 984 
 985     // Try to lock. Transition lock bits 0b01 => 0b00
 986     movptr(rax_reg, mark);
 987     orptr(rax_reg, markWord::unlocked_value);
 988     andptr(mark, ~(int32_t)markWord::unlocked_value);
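    // rax_reg now holds the expected unlocked mark word and mark holds the
    // desired locked value (lock bits 0b00) for the CAS below.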
 989     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 990     jcc(Assembler::notEqual, slow_path);
 991 
 992     bind(push);
 993     // After successful lock, push object on lock-stack.
 994     movptr(Address(thread, top), obj);
 995     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 996     jmpb(locked);
 997   }
 998 
 999   { // Handle inflated monitor.
1000     bind(inflated);
1001 
1002     const Register tagged_monitor = mark;
1003 
1004     // CAS owner (null => current thread).
1005     xorptr(rax_reg, rax_reg);
1006     lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1007     jccb(Assembler::equal, locked);
1008 
1009     // Check if recursive.
1010     cmpptr(thread, rax_reg);
1011     jccb(Assembler::notEqual, slow_path);
1012 
1013     // Recursive.
1014     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1015   }
1016 
1017   bind(locked);
1018   increment(Address(thread, JavaThread::held_monitor_count_offset()));
1019   // Set ZF = 1
1020   xorl(rax_reg, rax_reg);
1021 
1022 #ifdef ASSERT
1023   // Check that locked label is reached with ZF set.
1024   Label zf_correct;
1025   Label zf_bad_zero;
1026   jcc(Assembler::zero, zf_correct);
1027   jmp(zf_bad_zero);
1028 #endif
1029 
1030   bind(slow_path);
1031 #ifdef ASSERT
1032   // Check that slow_path label is reached with ZF not set.
1033   jcc(Assembler::notZero, zf_correct);
1034   stop("Fast Lock ZF != 0");
1035   bind(zf_bad_zero);
1036   stop("Fast Lock ZF != 1");
1037   bind(zf_correct);
1038 #endif
1039   // C2 uses the value of ZF to determine the continuation.
1040 }
1041 
1042 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
1043   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1044   assert(reg_rax == rax, "Used for CAS");
1045   assert_different_registers(obj, reg_rax, t);
1046 
1047   // Handle inflated monitor.
1048   Label inflated, inflated_check_lock_stack;
1049   // Finish fast unlock successfully.  MUST jump with ZF == 1
1050   Label unlocked;
1051 
1052   // Assume success.
1053   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
1054 
1055   const Register mark = t;
1056   const Register top = reg_rax;
1057 
1058   Label dummy;
1059   C2FastUnlockLightweightStub* stub = nullptr;
1060 
1061   if (!Compile::current()->output()->in_scratch_emit_size()) {
1062     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
1063     Compile::current()->output()->add_stub(stub);
1064   }
1065 
1066   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1067   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1068 
1069   { // Lightweight Unlock
1070 
1071     // Load top.
1072     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1073 
1074     // Prefetch mark.
1075     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1076 
1077     // Check if obj is top of lock-stack.
1078     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1079     // Top of lock stack was not obj. Must be monitor.
1080     jcc(Assembler::notEqual, inflated_check_lock_stack);
1081 
1082     // Pop lock-stack.
1083     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1084     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1085 
1086     // Check if recursive.
1087     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1088     jcc(Assembler::equal, unlocked);
1089 
1090     // We elide the monitor check, let the CAS fail instead.
1091 
1092     // Try to unlock. Transition lock bits 0b00 => 0b01
1093     movptr(reg_rax, mark);
1094     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1095     orptr(mark, markWord::unlocked_value);
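    // reg_rax holds the expected locked mark (lock bits 0b00) and mark holds
    // the desired unlocked value (lock bits 0b01) for the CAS below.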
1096     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1097     jcc(Assembler::notEqual, push_and_slow_path);
1098     jmp(unlocked);
1099   }
1100 
1101 
1102   { // Handle inflated monitor.
1103     bind(inflated_check_lock_stack);
1104 #ifdef ASSERT
1105     Label check_done;
1106     subl(top, oopSize);
1107     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1108     jcc(Assembler::below, check_done);
1109     cmpptr(obj, Address(thread, top));
1110     jccb(Assembler::notEqual, inflated_check_lock_stack);
1111     stop("Fast Unlock lock on stack");
1112     bind(check_done);
1113     testptr(mark, markWord::monitor_value);
1114     jccb(Assembler::notZero, inflated);
1115     stop("Fast Unlock not monitor");
1116 #endif
1117 
1118     bind(inflated);
1119 
1120     // mark contains the tagged ObjectMonitor*.
1121     const Register monitor = mark;
1122 
1123 #ifndef _LP64
1124     // Check if recursive.
1125     xorptr(reg_rax, reg_rax);
1126     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1127     jcc(Assembler::notZero, check_successor);
1128 
1129     // Check if the entry lists are empty.
1130     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1131     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1132     jcc(Assembler::notZero, check_successor);
1133 
1134     // Release lock.
1135     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1136 #else // _LP64
1137     Label recursive;
1138 
1139     // Check if recursive.
1140     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1141     jccb(Assembler::notEqual, recursive);
1142 
1143     // Check if the entry lists are empty.
1144     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1145     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1146     jcc(Assembler::notZero, check_successor);
1147 
1148     // Release lock.
1149     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1150     jmpb(unlocked);
1151 
1152     // Recursive unlock.
1153     bind(recursive);
1154     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1155     xorl(t, t);
1156 #endif
1157   }
1158 
1159   bind(unlocked);
1160   if (stub != nullptr) {
1161     bind(stub->unlocked_continuation());
1162   }
1163 
1164 #ifdef ASSERT
1165   // Check that unlocked label is reached with ZF set.
1166   Label zf_correct;
1167   jcc(Assembler::zero, zf_correct);
1168   stop("Fast Unlock ZF != 1");
1169 #endif
1170 
1171   if (stub != nullptr) {
1172     bind(stub->slow_path_continuation());
1173   }
1174 #ifdef ASSERT
1175   // Check that stub->continuation() label is reached with ZF not set.
1176   jccb(Assembler::notZero, zf_correct);
1177   stop("Fast Unlock ZF != 0");
1178   bind(zf_correct);
1179 #endif
1180   // C2 uses the value of ZF to determine the continuation.
1181 }
1182 
1183 //-------------------------------------------------------------------------------------------
1184 // Generic instructions support for use in .ad files C2 code generation
1185 
1186 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1187   if (dst != src) {
1188     movdqu(dst, src);
1189   }
1190   if (opcode == Op_AbsVD) {
1191     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1192   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
1194     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1199   if (opcode == Op_AbsVD) {
1200     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1201   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
1203     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1204   }
1205 }
1206 
1207 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1208   if (dst != src) {
1209     movdqu(dst, src);
1210   }
1211   if (opcode == Op_AbsVF) {
1212     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1213   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1215     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1216   }
1217 }
1218 
1219 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1220   if (opcode == Op_AbsVF) {
1221     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1222   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1224     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1229   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1230   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1231 
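  // There is no packed signed 64-bit min/max instruction before AVX-512, so the T_LONG
  // case is emulated with pcmpgtq + blendvpd; the legacy (non-VEX) blendvpd encoding uses
  // xmm0 as an implicit mask operand, which is why tmp is constrained to xmm0.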
1232   if (opcode == Op_MinV) {
1233     if (elem_bt == T_BYTE) {
1234       pminsb(dst, src);
1235     } else if (elem_bt == T_SHORT) {
1236       pminsw(dst, src);
1237     } else if (elem_bt == T_INT) {
1238       pminsd(dst, src);
1239     } else {
1240       assert(elem_bt == T_LONG, "required");
1241       assert(tmp == xmm0, "required");
1242       assert_different_registers(dst, src, tmp);
1243       movdqu(xmm0, dst);
1244       pcmpgtq(xmm0, src);
1245       blendvpd(dst, src);  // xmm0 as mask
1246     }
1247   } else { // opcode == Op_MaxV
1248     if (elem_bt == T_BYTE) {
1249       pmaxsb(dst, src);
1250     } else if (elem_bt == T_SHORT) {
1251       pmaxsw(dst, src);
1252     } else if (elem_bt == T_INT) {
1253       pmaxsd(dst, src);
1254     } else {
1255       assert(elem_bt == T_LONG, "required");
1256       assert(tmp == xmm0, "required");
1257       assert_different_registers(dst, src, tmp);
1258       movdqu(xmm0, src);
1259       pcmpgtq(xmm0, dst);
1260       blendvpd(dst, src);  // xmm0 as mask
1261     }
1262   }
1263 }
1264 
1265 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1266                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1267                                  int vlen_enc) {
1268   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1269 
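  // vpminsq/vpmaxsq require AVX-512 (and AVX-512VL for vectors shorter than 512 bits);
  // without them the T_LONG case falls back to a signed compare (vpcmpgtq) followed by
  // a blend that uses dst as the mask.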
1270   if (opcode == Op_MinV) {
1271     if (elem_bt == T_BYTE) {
1272       vpminsb(dst, src1, src2, vlen_enc);
1273     } else if (elem_bt == T_SHORT) {
1274       vpminsw(dst, src1, src2, vlen_enc);
1275     } else if (elem_bt == T_INT) {
1276       vpminsd(dst, src1, src2, vlen_enc);
1277     } else {
1278       assert(elem_bt == T_LONG, "required");
1279       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1280         vpminsq(dst, src1, src2, vlen_enc);
1281       } else {
1282         assert_different_registers(dst, src1, src2);
1283         vpcmpgtq(dst, src1, src2, vlen_enc);
1284         vblendvpd(dst, src1, src2, dst, vlen_enc);
1285       }
1286     }
1287   } else { // opcode == Op_MaxV
1288     if (elem_bt == T_BYTE) {
1289       vpmaxsb(dst, src1, src2, vlen_enc);
1290     } else if (elem_bt == T_SHORT) {
1291       vpmaxsw(dst, src1, src2, vlen_enc);
1292     } else if (elem_bt == T_INT) {
1293       vpmaxsd(dst, src1, src2, vlen_enc);
1294     } else {
1295       assert(elem_bt == T_LONG, "required");
1296       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1297         vpmaxsq(dst, src1, src2, vlen_enc);
1298       } else {
1299         assert_different_registers(dst, src1, src2);
1300         vpcmpgtq(dst, src1, src2, vlen_enc);
1301         vblendvpd(dst, src2, src1, dst, vlen_enc);
1302       }
1303     }
1304   }
1305 }
1306 
1307 // Float/Double min max
1308 
1309 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1310                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1311                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1312                                    int vlen_enc) {
1313   assert(UseAVX > 0, "required");
1314   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1315          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1316   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1317   assert_different_registers(a, tmp, atmp, btmp);
1318   assert_different_registers(b, tmp, atmp, btmp);
1319 
1320   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1321   bool is_double_word = is_double_word_type(elem_bt);
1322 
1323   /* Note on 'non-obvious' assembly sequence:
1324    *
1325    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1326    * and Java on how they handle floats:
1327    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1329    *
1330    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1331    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1332    *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is analogous):
1336    *   btmp = (b < +0.0) ? a : b
1337    *   atmp = (b < +0.0) ? b : a
1338    *   Tmp  = Max_Float(atmp , btmp)
1339    *   Res  = (atmp == NaN) ? atmp : Tmp
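   *
   *  Worked example (max, float): a = NaN, b = 1.0
   *   btmp = (1.0 < +0.0) ? a : b = 1.0
   *   atmp = (1.0 < +0.0) ? b : a = NaN
   *   Tmp  = Max_Float(NaN, 1.0) = 1.0   (hardware returns the second operand)
   *   Res  = (atmp == NaN) ? atmp : Tmp = NaN, so the NaN is propagated as Java requires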
1340    */
1341 
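  // Select the single- vs double-precision flavors of the blend, min/max and compare
  // helpers up front so the sequence below can be written once for both element types.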
1342   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1343   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1344   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1345   XMMRegister mask;
1346 
1347   if (!is_double_word && is_min) {
1348     mask = a;
1349     vblend = &MacroAssembler::vblendvps;
1350     vmaxmin = &MacroAssembler::vminps;
1351     vcmp = &MacroAssembler::vcmpps;
1352   } else if (!is_double_word && !is_min) {
1353     mask = b;
1354     vblend = &MacroAssembler::vblendvps;
1355     vmaxmin = &MacroAssembler::vmaxps;
1356     vcmp = &MacroAssembler::vcmpps;
1357   } else if (is_double_word && is_min) {
1358     mask = a;
1359     vblend = &MacroAssembler::vblendvpd;
1360     vmaxmin = &MacroAssembler::vminpd;
1361     vcmp = &MacroAssembler::vcmppd;
1362   } else {
1363     assert(is_double_word && !is_min, "sanity");
1364     mask = b;
1365     vblend = &MacroAssembler::vblendvpd;
1366     vmaxmin = &MacroAssembler::vmaxpd;
1367     vcmp = &MacroAssembler::vcmppd;
1368   }
1369 
  // Choose the working (maxmin) and scratch registers so that the EnableX86ECoreOpts sequence below stays correct when dst overlaps btmp.
1371   XMMRegister maxmin, scratch;
1372   if (dst == btmp) {
1373     maxmin = btmp;
1374     scratch = tmp;
1375   } else {
1376     maxmin = tmp;
1377     scratch = btmp;
1378   }
1379 
  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1381   if (precompute_mask && !is_double_word) {
1382     vpsrad(tmp, mask, 32, vlen_enc);
1383     mask = tmp;
1384   } else if (precompute_mask && is_double_word) {
1385     vpxor(tmp, tmp, tmp, vlen_enc);
1386     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1387     mask = tmp;
1388   }
1389 
1390   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1391   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1392   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1393   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1394   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1395 }
1396 
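// AVX-512 variant of vminmax_fp: the sign bits of the mask operand are turned into an
// opmask (evpmovd2m/evpmovq2m), the inputs are swapped into atmp/btmp under that mask,
// and a final unordered compare merges any NaN from atmp back into the result.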
1397 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1398                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1399                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1400                                     int vlen_enc) {
1401   assert(UseAVX > 2, "required");
1402   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1403          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1404   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1405   assert_different_registers(dst, a, atmp, btmp);
1406   assert_different_registers(dst, b, atmp, btmp);
1407 
1408   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1409   bool is_double_word = is_double_word_type(elem_bt);
1410   bool merge = true;
1411 
1412   if (!is_double_word && is_min) {
1413     evpmovd2m(ktmp, a, vlen_enc);
1414     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1415     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1416     vminps(dst, atmp, btmp, vlen_enc);
1417     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1418     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1419   } else if (!is_double_word && !is_min) {
1420     evpmovd2m(ktmp, b, vlen_enc);
1421     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1422     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1423     vmaxps(dst, atmp, btmp, vlen_enc);
1424     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1425     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1426   } else if (is_double_word && is_min) {
1427     evpmovq2m(ktmp, a, vlen_enc);
1428     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1429     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1430     vminpd(dst, atmp, btmp, vlen_enc);
1431     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1432     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1433   } else {
1434     assert(is_double_word && !is_min, "sanity");
1435     evpmovq2m(ktmp, b, vlen_enc);
1436     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1437     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1438     vmaxpd(dst, atmp, btmp, vlen_enc);
1439     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1440     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1441   }
1442 }
1443 
1444 // Float/Double signum
1445 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1446   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1447 
1448   Label DONE_LABEL;
1449 
1450   if (opcode == Op_SignumF) {
1451     assert(UseSSE > 0, "required");
1452     ucomiss(dst, zero);
1453     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1454     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1455     movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);    // argument > +0.0: return 1.0f (already in dst)
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); // argument is negative: flip the sign bit to return -1.0f
1458   } else if (opcode == Op_SignumD) {
1459     assert(UseSSE > 1, "required");
1460     ucomisd(dst, zero);
1461     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1462     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1463     movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);    // argument > +0.0: return 1.0 (already in dst)
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); // argument is negative: flip the sign bit to return -1.0
1466   }
1467 
1468   bind(DONE_LABEL);
1469 }
1470 
1471 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1472   if (sign) {
1473     pmovsxbw(dst, src);
1474   } else {
1475     pmovzxbw(dst, src);
1476   }
1477 }
1478 
1479 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1480   if (sign) {
1481     vpmovsxbw(dst, src, vector_len);
1482   } else {
1483     vpmovzxbw(dst, src, vector_len);
1484   }
1485 }
1486 
1487 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1488   if (sign) {
1489     vpmovsxbd(dst, src, vector_len);
1490   } else {
1491     vpmovzxbd(dst, src, vector_len);
1492   }
1493 }
1494 
1495 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1496   if (sign) {
1497     vpmovsxwd(dst, src, vector_len);
1498   } else {
1499     vpmovzxwd(dst, src, vector_len);
1500   }
1501 }
1502 
1503 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1504                                      int shift, int vector_len) {
1505   if (opcode == Op_RotateLeftV) {
1506     if (etype == T_INT) {
1507       evprold(dst, src, shift, vector_len);
1508     } else {
1509       assert(etype == T_LONG, "expected type T_LONG");
1510       evprolq(dst, src, shift, vector_len);
1511     }
1512   } else {
1513     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1514     if (etype == T_INT) {
1515       evprord(dst, src, shift, vector_len);
1516     } else {
1517       assert(etype == T_LONG, "expected type T_LONG");
1518       evprorq(dst, src, shift, vector_len);
1519     }
1520   }
1521 }
1522 
1523 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1524                                      XMMRegister shift, int vector_len) {
1525   if (opcode == Op_RotateLeftV) {
1526     if (etype == T_INT) {
1527       evprolvd(dst, src, shift, vector_len);
1528     } else {
1529       assert(etype == T_LONG, "expected type T_LONG");
1530       evprolvq(dst, src, shift, vector_len);
1531     }
1532   } else {
1533     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1534     if (etype == T_INT) {
1535       evprorvd(dst, src, shift, vector_len);
1536     } else {
1537       assert(etype == T_LONG, "expected type T_LONG");
1538       evprorvq(dst, src, shift, vector_len);
1539     }
1540   }
1541 }
1542 
1543 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1544   if (opcode == Op_RShiftVI) {
1545     psrad(dst, shift);
1546   } else if (opcode == Op_LShiftVI) {
1547     pslld(dst, shift);
1548   } else {
1549     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1550     psrld(dst, shift);
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1555   switch (opcode) {
1556     case Op_RShiftVI:  psrad(dst, shift); break;
1557     case Op_LShiftVI:  pslld(dst, shift); break;
1558     case Op_URShiftVI: psrld(dst, shift); break;
1559 
1560     default: assert(false, "%s", NodeClassNames[opcode]);
1561   }
1562 }
1563 
1564 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1565   if (opcode == Op_RShiftVI) {
1566     vpsrad(dst, nds, shift, vector_len);
1567   } else if (opcode == Op_LShiftVI) {
1568     vpslld(dst, nds, shift, vector_len);
1569   } else {
1570     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1571     vpsrld(dst, nds, shift, vector_len);
1572   }
1573 }
1574 
1575 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1576   switch (opcode) {
1577     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1578     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1579     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1580 
1581     default: assert(false, "%s", NodeClassNames[opcode]);
1582   }
1583 }
1584 
1585 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1586   switch (opcode) {
1587     case Op_RShiftVB:  // fall-through
1588     case Op_RShiftVS:  psraw(dst, shift); break;
1589 
1590     case Op_LShiftVB:  // fall-through
1591     case Op_LShiftVS:  psllw(dst, shift);   break;
1592 
1593     case Op_URShiftVS: // fall-through
1594     case Op_URShiftVB: psrlw(dst, shift);  break;
1595 
1596     default: assert(false, "%s", NodeClassNames[opcode]);
1597   }
1598 }
1599 
1600 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1601   switch (opcode) {
1602     case Op_RShiftVB:  // fall-through
1603     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1604 
1605     case Op_LShiftVB:  // fall-through
1606     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1607 
1608     case Op_URShiftVS: // fall-through
1609     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1610 
1611     default: assert(false, "%s", NodeClassNames[opcode]);
1612   }
1613 }
1614 
1615 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1616   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1618     case Op_LShiftVL:  psllq(dst, shift); break;
1619     case Op_URShiftVL: psrlq(dst, shift); break;
1620 
1621     default: assert(false, "%s", NodeClassNames[opcode]);
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1626   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1628   } else if (opcode == Op_LShiftVL) {
1629     psllq(dst, shift);
1630   } else {
1631     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1632     psrlq(dst, shift);
1633   }
1634 }
1635 
1636 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1637   switch (opcode) {
1638     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1639     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1640     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1641 
1642     default: assert(false, "%s", NodeClassNames[opcode]);
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1647   if (opcode == Op_RShiftVL) {
1648     evpsraq(dst, nds, shift, vector_len);
1649   } else if (opcode == Op_LShiftVL) {
1650     vpsllq(dst, nds, shift, vector_len);
1651   } else {
1652     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1653     vpsrlq(dst, nds, shift, vector_len);
1654   }
1655 }
1656 
1657 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1658   switch (opcode) {
1659     case Op_RShiftVB:  // fall-through
1660     case Op_RShiftVS:  // fall-through
1661     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1662 
1663     case Op_LShiftVB:  // fall-through
1664     case Op_LShiftVS:  // fall-through
1665     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1666 
1667     case Op_URShiftVB: // fall-through
1668     case Op_URShiftVS: // fall-through
1669     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1670 
1671     default: assert(false, "%s", NodeClassNames[opcode]);
1672   }
1673 }
1674 
1675 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1676   switch (opcode) {
1677     case Op_RShiftVB:  // fall-through
1678     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1679 
1680     case Op_LShiftVB:  // fall-through
1681     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1682 
1683     case Op_URShiftVB: // fall-through
1684     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1685 
1686     default: assert(false, "%s", NodeClassNames[opcode]);
1687   }
1688 }
1689 
1690 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1691   assert(UseAVX >= 2, "required");
1692   switch (opcode) {
1693     case Op_RShiftVL: {
1694       if (UseAVX > 2) {
1695         assert(tmp == xnoreg, "not used");
1696         if (!VM_Version::supports_avx512vl()) {
1697           vlen_enc = Assembler::AVX_512bit;
1698         }
1699         evpsravq(dst, src, shift, vlen_enc);
1700       } else {
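        // AVX2 has no vpsravq; emulate the arithmetic shift with logical shifts using
        // the identity (x >>s n) == ((x >>u n) ^ t) - t, where t is the per-lane
        // sign-bit mask shifted right by n.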
1701         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1702         vpsrlvq(dst, src, shift, vlen_enc);
1703         vpsrlvq(tmp, tmp, shift, vlen_enc);
1704         vpxor(dst, dst, tmp, vlen_enc);
1705         vpsubq(dst, dst, tmp, vlen_enc);
1706       }
1707       break;
1708     }
1709     case Op_LShiftVL: {
1710       assert(tmp == xnoreg, "not used");
1711       vpsllvq(dst, src, shift, vlen_enc);
1712       break;
1713     }
1714     case Op_URShiftVL: {
1715       assert(tmp == xnoreg, "not used");
1716       vpsrlvq(dst, src, shift, vlen_enc);
1717       break;
1718     }
1719     default: assert(false, "%s", NodeClassNames[opcode]);
1720   }
1721 }
1722 
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1724 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1725   assert(opcode == Op_LShiftVB ||
1726          opcode == Op_RShiftVB ||
1727          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1728   bool sign = (opcode != Op_URShiftVB);
1729   assert(vector_len == 0, "required");
1730   vextendbd(sign, dst, src, 1);
1731   vpmovzxbd(vtmp, shift, 1);
1732   varshiftd(opcode, dst, dst, vtmp, 1);
1733   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1734   vextracti128_high(vtmp, dst);
1735   vpackusdw(dst, dst, vtmp, 0);
1736 }
1737 
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1739 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1740   assert(opcode == Op_LShiftVB ||
1741          opcode == Op_RShiftVB ||
1742          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1743   bool sign = (opcode != Op_URShiftVB);
1744   int ext_vector_len = vector_len + 1;
1745   vextendbw(sign, dst, src, ext_vector_len);
1746   vpmovzxbw(vtmp, shift, ext_vector_len);
1747   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1748   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1749   if (vector_len == 0) {
1750     vextracti128_high(vtmp, dst);
1751     vpackuswb(dst, dst, vtmp, vector_len);
1752   } else {
1753     vextracti64x4_high(vtmp, dst);
1754     vpackuswb(dst, dst, vtmp, vector_len);
1755     vpermq(dst, dst, 0xD8, vector_len);
1756   }
1757 }
1758 
1759 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1760   switch(typ) {
1761     case T_BYTE:
1762       pinsrb(dst, val, idx);
1763       break;
1764     case T_SHORT:
1765       pinsrw(dst, val, idx);
1766       break;
1767     case T_INT:
1768       pinsrd(dst, val, idx);
1769       break;
1770     case T_LONG:
1771       pinsrq(dst, val, idx);
1772       break;
1773     default:
1774       assert(false,"Should not reach here.");
1775       break;
1776   }
1777 }
1778 
1779 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1780   switch(typ) {
1781     case T_BYTE:
1782       vpinsrb(dst, src, val, idx);
1783       break;
1784     case T_SHORT:
1785       vpinsrw(dst, src, val, idx);
1786       break;
1787     case T_INT:
1788       vpinsrd(dst, src, val, idx);
1789       break;
1790     case T_LONG:
1791       vpinsrq(dst, src, val, idx);
1792       break;
1793     default:
1794       assert(false,"Should not reach here.");
1795       break;
1796   }
1797 }
1798 
1799 #ifdef _LP64
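// Masked variant of vgather8b_offset: gather one 64-bit slice (8 bytes or 4 shorts)
// element by element, loading src[offset + idx_base[i]] into lane i only when bit
// 'mask_idx' of 'mask' is set; lanes whose mask bit is clear are left zero.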
1800 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1801                                                 XMMRegister dst, Register base,
1802                                                 Register idx_base,
1803                                                 Register offset, Register mask,
1804                                                 Register mask_idx, Register rtmp,
1805                                                 int vlen_enc) {
1806   vpxor(dst, dst, dst, vlen_enc);
1807   if (elem_bt == T_SHORT) {
1808     for (int i = 0; i < 4; i++) {
1809       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1810       Label skip_load;
1811       btq(mask, mask_idx);
1812       jccb(Assembler::carryClear, skip_load);
1813       movl(rtmp, Address(idx_base, i * 4));
1814       if (offset != noreg) {
1815         addl(rtmp, offset);
1816       }
1817       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1818       bind(skip_load);
1819       incq(mask_idx);
1820     }
1821   } else {
1822     assert(elem_bt == T_BYTE, "");
1823     for (int i = 0; i < 8; i++) {
1824       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1825       Label skip_load;
1826       btq(mask, mask_idx);
1827       jccb(Assembler::carryClear, skip_load);
1828       movl(rtmp, Address(idx_base, i * 4));
1829       if (offset != noreg) {
1830         addl(rtmp, offset);
1831       }
1832       pinsrb(dst, Address(base, rtmp), i);
1833       bind(skip_load);
1834       incq(mask_idx);
1835     }
1836   }
1837 }
1838 #endif // _LP64
1839 
1840 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1841                                          Register base, Register idx_base,
1842                                          Register offset, Register rtmp,
1843                                          int vlen_enc) {
1844   vpxor(dst, dst, dst, vlen_enc);
1845   if (elem_bt == T_SHORT) {
1846     for (int i = 0; i < 4; i++) {
1847       // dst[i] = src[offset + idx_base[i]]
1848       movl(rtmp, Address(idx_base, i * 4));
1849       if (offset != noreg) {
1850         addl(rtmp, offset);
1851       }
1852       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1853     }
1854   } else {
1855     assert(elem_bt == T_BYTE, "");
1856     for (int i = 0; i < 8; i++) {
1857       // dst[i] = src[offset + idx_base[i]]
1858       movl(rtmp, Address(idx_base, i * 4));
1859       if (offset != noreg) {
1860         addl(rtmp, offset);
1861       }
1862       pinsrb(dst, Address(base, rtmp), i);
1863     }
1864   }
1865 }
1866 
1867 /*
 * Gather using a hybrid algorithm: a partially unrolled scalar loop first
 * accumulates values from the gather indices into a quad-word (64-bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation that places the slice into the appropriate lane positions of the
 * destination vector. The following pseudo code describes the algorithm in
 * detail:
1874  *
1875  * DST_VEC = ZERO_VEC
1876  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1877  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1878  * FOREACH_ITER:
1879  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1880  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1881  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1882  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1883  *
 * With each iteration, the doubleword permute indices (0, 1) corresponding to
 * the gathered quadword are shifted right by two lane positions.
1886  *
1887  */
1888 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1889                                         Register base, Register idx_base,
1890                                         Register offset, Register mask,
1891                                         XMMRegister xtmp1, XMMRegister xtmp2,
1892                                         XMMRegister temp_dst, Register rtmp,
1893                                         Register mask_idx, Register length,
1894                                         int vector_len, int vlen_enc) {
1895   Label GATHER8_LOOP;
1896   assert(is_subword_type(elem_ty), "");
1897   movl(length, vector_len);
1898   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1899   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);             // xtmp2 = {-1, -1, ...}
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); // xtmp2 = 0 - (-1) = {1, 1, ...}
1902   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1903   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1904 
1905   bind(GATHER8_LOOP);
1906     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1907     if (mask == noreg) {
1908       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1909     } else {
1910       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1911     }
1912     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1913     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1914     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1915     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1916     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1917     vpor(dst, dst, temp_dst, vlen_enc);
1918     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1919     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1920     jcc(Assembler::notEqual, GATHER8_LOOP);
1921 }
1922 
1923 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1924   switch(typ) {
1925     case T_INT:
1926       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1927       break;
1928     case T_FLOAT:
1929       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1930       break;
1931     case T_LONG:
1932       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1933       break;
1934     case T_DOUBLE:
1935       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1936       break;
1937     default:
1938       assert(false,"Should not reach here.");
1939       break;
1940   }
1941 }
1942 
1943 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1944   switch(typ) {
1945     case T_INT:
1946       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1947       break;
1948     case T_FLOAT:
1949       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1950       break;
1951     case T_LONG:
1952       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1953       break;
1954     case T_DOUBLE:
1955       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1956       break;
1957     default:
1958       assert(false,"Should not reach here.");
1959       break;
1960   }
1961 }
1962 
1963 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1964   switch(typ) {
1965     case T_INT:
1966       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1967       break;
1968     case T_FLOAT:
1969       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1970       break;
1971     case T_LONG:
1972       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1973       break;
1974     case T_DOUBLE:
1975       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1976       break;
1977     default:
1978       assert(false,"Should not reach here.");
1979       break;
1980   }
1981 }
1982 
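// Expand a vector of boolean bytes (0 or 1) into a full element-width mask: negating
// the bytes yields 0 or -1, which is then sign-extended to the requested element size.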
1983 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1984   if (vlen_in_bytes <= 16) {
1985     pxor (dst, dst);
1986     psubb(dst, src);
1987     switch (elem_bt) {
1988       case T_BYTE:   /* nothing to do */ break;
1989       case T_SHORT:  pmovsxbw(dst, dst); break;
1990       case T_INT:    pmovsxbd(dst, dst); break;
1991       case T_FLOAT:  pmovsxbd(dst, dst); break;
1992       case T_LONG:   pmovsxbq(dst, dst); break;
1993       case T_DOUBLE: pmovsxbq(dst, dst); break;
1994 
1995       default: assert(false, "%s", type2name(elem_bt));
1996     }
1997   } else {
1998     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1999     int vlen_enc = vector_length_encoding(vlen_in_bytes);
2000 
2001     vpxor (dst, dst, dst, vlen_enc);
2002     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
2003 
2004     switch (elem_bt) {
2005       case T_BYTE:   /* nothing to do */            break;
2006       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
2007       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
2008       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
2009       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
2010       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
2011 
2012       default: assert(false, "%s", type2name(elem_bt));
2013     }
2014   }
2015 }
2016 
2017 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
2018   if (novlbwdq) {
2019     vpmovsxbd(xtmp, src, vlen_enc);
2020     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
2021             Assembler::eq, true, vlen_enc, noreg);
2022   } else {
2023     vpxor(xtmp, xtmp, xtmp, vlen_enc);
2024     vpsubb(xtmp, xtmp, src, vlen_enc);
2025     evpmovb2m(dst, xtmp, vlen_enc);
2026   }
2027 }
2028 
2029 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
2030   switch (vlen_in_bytes) {
2031     case 4:  movdl(dst, src);   break;
2032     case 8:  movq(dst, src);    break;
2033     case 16: movdqu(dst, src);  break;
2034     case 32: vmovdqu(dst, src); break;
2035     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
2036     default: ShouldNotReachHere();
2037   }
2038 }
2039 
2040 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
2041   assert(rscratch != noreg || always_reachable(src), "missing");
2042 
2043   if (reachable(src)) {
2044     load_vector(dst, as_Address(src), vlen_in_bytes);
2045   } else {
2046     lea(rscratch, src);
2047     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
2048   }
2049 }
2050 
2051 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
2052   int vlen_enc = vector_length_encoding(vlen);
2053   if (VM_Version::supports_avx()) {
2054     if (bt == T_LONG) {
2055       if (VM_Version::supports_avx2()) {
2056         vpbroadcastq(dst, src, vlen_enc);
2057       } else {
2058         vmovddup(dst, src, vlen_enc);
2059       }
2060     } else if (bt == T_DOUBLE) {
2061       if (vlen_enc != Assembler::AVX_128bit) {
2062         vbroadcastsd(dst, src, vlen_enc, noreg);
2063       } else {
2064         vmovddup(dst, src, vlen_enc);
2065       }
2066     } else {
2067       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
2068         vpbroadcastd(dst, src, vlen_enc);
2069       } else {
2070         vbroadcastss(dst, src, vlen_enc);
2071       }
2072     }
2073   } else if (VM_Version::supports_sse3()) {
2074     movddup(dst, src);
2075   } else {
2076     movq(dst, src);
2077     if (vlen == 16) {
2078       punpcklqdq(dst, dst);
2079     }
2080   }
2081 }
2082 
2083 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
2084   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
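  // For example, T_INT is at offset exact_log2(4) << 6 = 128 and T_DOUBLE at (exact_log2(8) << 6) + 128 = 320.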
2085   int offset = exact_log2(type2aelembytes(bt)) << 6;
2086   if (is_floating_point_type(bt)) {
2087     offset += 128;
2088   }
2089   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
2090   load_vector(dst, addr, vlen_in_bytes);
2091 }
2092 
2093 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
2094 
2095 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
2096   int vector_len = Assembler::AVX_128bit;
2097 
2098   switch (opcode) {
2099     case Op_AndReductionV:  pand(dst, src); break;
2100     case Op_OrReductionV:   por (dst, src); break;
2101     case Op_XorReductionV:  pxor(dst, src); break;
2102     case Op_MinReductionV:
2103       switch (typ) {
2104         case T_BYTE:        pminsb(dst, src); break;
2105         case T_SHORT:       pminsw(dst, src); break;
2106         case T_INT:         pminsd(dst, src); break;
2107         case T_LONG:        assert(UseAVX > 2, "required");
2108                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
2109         default:            assert(false, "wrong type");
2110       }
2111       break;
2112     case Op_MaxReductionV:
2113       switch (typ) {
2114         case T_BYTE:        pmaxsb(dst, src); break;
2115         case T_SHORT:       pmaxsw(dst, src); break;
2116         case T_INT:         pmaxsd(dst, src); break;
2117         case T_LONG:        assert(UseAVX > 2, "required");
2118                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
2119         default:            assert(false, "wrong type");
2120       }
2121       break;
2122     case Op_AddReductionVF: addss(dst, src); break;
2123     case Op_AddReductionVD: addsd(dst, src); break;
2124     case Op_AddReductionVI:
2125       switch (typ) {
2126         case T_BYTE:        paddb(dst, src); break;
2127         case T_SHORT:       paddw(dst, src); break;
2128         case T_INT:         paddd(dst, src); break;
2129         default:            assert(false, "wrong type");
2130       }
2131       break;
2132     case Op_AddReductionVL: paddq(dst, src); break;
2133     case Op_MulReductionVF: mulss(dst, src); break;
2134     case Op_MulReductionVD: mulsd(dst, src); break;
2135     case Op_MulReductionVI:
2136       switch (typ) {
2137         case T_SHORT:       pmullw(dst, src); break;
2138         case T_INT:         pmulld(dst, src); break;
2139         default:            assert(false, "wrong type");
2140       }
2141       break;
2142     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2143                             evpmullq(dst, dst, src, vector_len); break;
2144     default:                assert(false, "wrong opcode");
2145   }
2146 }
2147 
2148 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2149   int vector_len = Assembler::AVX_256bit;
2150 
2151   switch (opcode) {
2152     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2153     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2154     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2155     case Op_MinReductionV:
2156       switch (typ) {
2157         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2158         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2159         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2160         case T_LONG:        assert(UseAVX > 2, "required");
2161                             vpminsq(dst, src1, src2, vector_len); break;
2162         default:            assert(false, "wrong type");
2163       }
2164       break;
2165     case Op_MaxReductionV:
2166       switch (typ) {
2167         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2168         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2169         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2170         case T_LONG:        assert(UseAVX > 2, "required");
2171                             vpmaxsq(dst, src1, src2, vector_len); break;
2172         default:            assert(false, "wrong type");
2173       }
2174       break;
2175     case Op_AddReductionVI:
2176       switch (typ) {
2177         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2178         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2179         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2180         default:            assert(false, "wrong type");
2181       }
2182       break;
2183     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2184     case Op_MulReductionVI:
2185       switch (typ) {
2186         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2187         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2188         default:            assert(false, "wrong type");
2189       }
2190       break;
2191     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2192     default:                assert(false, "wrong opcode");
2193   }
2194 }
2195 
2196 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2197                                   XMMRegister dst, XMMRegister src,
2198                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2199   switch (opcode) {
2200     case Op_AddReductionVF:
2201     case Op_MulReductionVF:
2202       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2203       break;
2204 
2205     case Op_AddReductionVD:
2206     case Op_MulReductionVD:
2207       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2208       break;
2209 
2210     default: assert(false, "wrong opcode");
2211   }
2212 }
2213 
2214 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2215                              Register dst, Register src1, XMMRegister src2,
2216                              XMMRegister vtmp1, XMMRegister vtmp2) {
2217   switch (vlen) {
2218     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2219     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2220     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2221     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2222 
2223     default: assert(false, "wrong vector length");
2224   }
2225 }
2226 
2227 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2228                              Register dst, Register src1, XMMRegister src2,
2229                              XMMRegister vtmp1, XMMRegister vtmp2) {
2230   switch (vlen) {
2231     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2232     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2233     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2234     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2235 
2236     default: assert(false, "wrong vector length");
2237   }
2238 }
2239 
2240 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2241                              Register dst, Register src1, XMMRegister src2,
2242                              XMMRegister vtmp1, XMMRegister vtmp2) {
2243   switch (vlen) {
2244     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2245     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2246     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2247     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2248 
2249     default: assert(false, "wrong vector length");
2250   }
2251 }
2252 
2253 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2254                              Register dst, Register src1, XMMRegister src2,
2255                              XMMRegister vtmp1, XMMRegister vtmp2) {
2256   switch (vlen) {
2257     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2258     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2259     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2260     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2261 
2262     default: assert(false, "wrong vector length");
2263   }
2264 }
2265 
2266 #ifdef _LP64
2267 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2268                              Register dst, Register src1, XMMRegister src2,
2269                              XMMRegister vtmp1, XMMRegister vtmp2) {
2270   switch (vlen) {
2271     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2272     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2273     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2274 
2275     default: assert(false, "wrong vector length");
2276   }
2277 }
2278 #endif // _LP64
2279 
2280 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2281   switch (vlen) {
2282     case 2:
2283       assert(vtmp2 == xnoreg, "");
2284       reduce2F(opcode, dst, src, vtmp1);
2285       break;
2286     case 4:
2287       assert(vtmp2 == xnoreg, "");
2288       reduce4F(opcode, dst, src, vtmp1);
2289       break;
2290     case 8:
2291       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2292       break;
2293     case 16:
2294       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2295       break;
2296     default: assert(false, "wrong vector length");
2297   }
2298 }
2299 
2300 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2301   switch (vlen) {
2302     case 2:
2303       assert(vtmp2 == xnoreg, "");
2304       reduce2D(opcode, dst, src, vtmp1);
2305       break;
2306     case 4:
2307       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2308       break;
2309     case 8:
2310       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2311       break;
2312     default: assert(false, "wrong vector length");
2313   }
2314 }
2315 
2316 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2317   if (opcode == Op_AddReductionVI) {
2318     if (vtmp1 != src2) {
2319       movdqu(vtmp1, src2);
2320     }
2321     phaddd(vtmp1, vtmp1);
2322   } else {
2323     pshufd(vtmp1, src2, 0x1);
2324     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2325   }
2326   movdl(vtmp2, src1);
2327   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2328   movdl(dst, vtmp1);
2329 }
2330 
2331 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2332   if (opcode == Op_AddReductionVI) {
2333     if (vtmp1 != src2) {
2334       movdqu(vtmp1, src2);
2335     }
2336     phaddd(vtmp1, src2);
2337     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2338   } else {
2339     pshufd(vtmp2, src2, 0xE);
2340     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2341     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2342   }
2343 }
2344 
2345 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2346   if (opcode == Op_AddReductionVI) {
2347     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2348     vextracti128_high(vtmp2, vtmp1);
2349     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2350     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2351   } else {
2352     vextracti128_high(vtmp1, src2);
2353     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2354     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2355   }
2356 }
2357 
2358 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2359   vextracti64x4_high(vtmp2, src2);
2360   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2361   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2362 }
2363 
2364 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2365   pshufd(vtmp2, src2, 0x1);
2366   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2367   movdqu(vtmp1, vtmp2);
2368   psrldq(vtmp1, 2);
2369   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2370   movdqu(vtmp2, vtmp1);
2371   psrldq(vtmp2, 1);
2372   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2373   movdl(vtmp2, src1);
2374   pmovsxbd(vtmp1, vtmp1);
2375   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2376   pextrb(dst, vtmp1, 0x0);
2377   movsbl(dst, dst);
2378 }
2379 
2380 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2381   pshufd(vtmp1, src2, 0xE);
2382   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2383   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2384 }
2385 
2386 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2387   vextracti128_high(vtmp2, src2);
2388   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2389   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2390 }
2391 
2392 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2393   vextracti64x4_high(vtmp1, src2);
2394   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2395   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2396 }
2397 
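// Byte multiply reductions: x86 has no byte multiply instruction, so the bytes are first
// sign-extended to shorts and then reduced with the short reduction helpers.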
2398 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2399   pmovsxbw(vtmp2, src2);
2400   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2401 }
2402 
2403 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2404   if (UseAVX > 1) {
2405     int vector_len = Assembler::AVX_256bit;
2406     vpmovsxbw(vtmp1, src2, vector_len);
2407     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2408   } else {
2409     pmovsxbw(vtmp2, src2);
2410     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2411     pshufd(vtmp2, src2, 0x1);
2412     pmovsxbw(vtmp2, src2);
2413     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2414   }
2415 }
2416 
2417 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2418   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2419     int vector_len = Assembler::AVX_512bit;
2420     vpmovsxbw(vtmp1, src2, vector_len);
2421     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2422   } else {
2423     assert(UseAVX >= 2,"Should not reach here.");
2424     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2425     vextracti128_high(vtmp2, src2);
2426     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2427   }
2428 }
2429 
2430 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2431   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2432   vextracti64x4_high(vtmp2, src2);
2433   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2434 }
2435 
2436 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2437   if (opcode == Op_AddReductionVI) {
2438     if (vtmp1 != src2) {
2439       movdqu(vtmp1, src2);
2440     }
2441     phaddw(vtmp1, vtmp1);
2442     phaddw(vtmp1, vtmp1);
2443   } else {
2444     pshufd(vtmp2, src2, 0x1);
2445     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2446     movdqu(vtmp1, vtmp2);
2447     psrldq(vtmp1, 2);
2448     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2449   }
2450   movdl(vtmp2, src1);
2451   pmovsxwd(vtmp1, vtmp1);
2452   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2453   pextrw(dst, vtmp1, 0x0);
2454   movswl(dst, dst);
2455 }
2456 
2457 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2458   if (opcode == Op_AddReductionVI) {
2459     if (vtmp1 != src2) {
2460       movdqu(vtmp1, src2);
2461     }
2462     phaddw(vtmp1, src2);
2463   } else {
2464     pshufd(vtmp1, src2, 0xE);
2465     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2466   }
2467   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2468 }
2469 
2470 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2471   if (opcode == Op_AddReductionVI) {
2472     int vector_len = Assembler::AVX_256bit;
2473     vphaddw(vtmp2, src2, src2, vector_len);
2474     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2475   } else {
2476     vextracti128_high(vtmp2, src2);
2477     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2478   }
2479   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2480 }
2481 
2482 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2483   int vector_len = Assembler::AVX_256bit;
2484   vextracti64x4_high(vtmp1, src2);
2485   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2486   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2487 }
2488 
2489 #ifdef _LP64
2490 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2491   pshufd(vtmp2, src2, 0xE);
2492   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2493   movdq(vtmp1, src1);
2494   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2495   movdq(dst, vtmp1);
2496 }
2497 
2498 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2499   vextracti128_high(vtmp1, src2);
2500   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2501   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2502 }
2503 
2504 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2505   vextracti64x4_high(vtmp2, src2);
2506   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2507   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2508 }
2509 
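// Build an opmask whose low 'len' bits are set: bzhi clears all bits of -1 at position
// 'len' and above.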
2510 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2511   mov64(temp, -1L);
2512   bzhiq(temp, temp, len);
2513   kmovql(dst, temp);
2514 }
2515 #endif // _LP64
2516 
2517 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2518   reduce_operation_128(T_FLOAT, opcode, dst, src);
2519   pshufd(vtmp, src, 0x1);
2520   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2521 }
2522 
2523 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2524   reduce2F(opcode, dst, src, vtmp);
2525   pshufd(vtmp, src, 0x2);
2526   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2527   pshufd(vtmp, src, 0x3);
2528   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2529 }
2530 
2531 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2532   reduce4F(opcode, dst, src, vtmp2);
2533   vextractf128_high(vtmp2, src);
2534   reduce4F(opcode, dst, vtmp2, vtmp1);
2535 }
2536 
2537 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2538   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2539   vextracti64x4_high(vtmp1, src);
2540   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2541 }
2542 
2543 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2544   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2545   pshufd(vtmp, src, 0xE);
2546   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2547 }
2548 
2549 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2550   reduce2D(opcode, dst, src, vtmp2);
2551   vextractf128_high(vtmp2, src);
2552   reduce2D(opcode, dst, vtmp2, vtmp1);
2553 }
2554 
2555 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2556   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2557   vextracti64x4_high(vtmp1, src);
2558   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2559 }
2560 
2561 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2562   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2563 }
2564 
2565 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2566   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2567 }
2568 
2569 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2570                                  int vec_enc) {
2571   switch(elem_bt) {
2572     case T_INT:
2573     case T_FLOAT:
2574       vmaskmovps(dst, src, mask, vec_enc);
2575       break;
2576     case T_LONG:
2577     case T_DOUBLE:
2578       vmaskmovpd(dst, src, mask, vec_enc);
2579       break;
2580     default:
2581       fatal("Unsupported type %s", type2name(elem_bt));
2582       break;
2583   }
2584 }
2585 
2586 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2587                                  int vec_enc) {
2588   switch(elem_bt) {
2589     case T_INT:
2590     case T_FLOAT:
2591       vmaskmovps(dst, src, mask, vec_enc);
2592       break;
2593     case T_LONG:
2594     case T_DOUBLE:
2595       vmaskmovpd(dst, src, mask, vec_enc);
2596       break;
2597     default:
2598       fatal("Unsupported type %s", type2name(elem_bt));
2599       break;
2600   }
2601 }
2602 
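// Tree-reduce a vector of floats to a single min/max value: each step extracts the upper
// half of the working vector (or permutes the upper elements down within a 128-bit lane)
// and folds it into the lower half with vminmax_fp, halving the width until one element
// remains. When is_dst_valid, the incoming value in dst is folded into the final result.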
2603 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2604                                           XMMRegister dst, XMMRegister src,
2605                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2606                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2607   const int permconst[] = {1, 14};
2608   XMMRegister wsrc = src;
2609   XMMRegister wdst = xmm_0;
2610   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2611 
2612   int vlen_enc = Assembler::AVX_128bit;
2613   if (vlen == 16) {
2614     vlen_enc = Assembler::AVX_256bit;
2615   }
2616 
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2618     if (i == 0 && !is_dst_valid) {
2619       wdst = dst;
2620     }
2621     if (i == 3) {
2622       vextracti64x4_high(wtmp, wsrc);
2623     } else if (i == 2) {
2624       vextracti128_high(wtmp, wsrc);
2625     } else { // i = [0,1]
2626       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2627     }
2628     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2629     wsrc = wdst;
2630     vlen_enc = Assembler::AVX_128bit;
2631   }
2632   if (is_dst_valid) {
2633     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2634   }
2635 }
2636 
2637 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2638                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2639                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2640   XMMRegister wsrc = src;
2641   XMMRegister wdst = xmm_0;
2642   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2643   int vlen_enc = Assembler::AVX_128bit;
2644   if (vlen == 8) {
2645     vlen_enc = Assembler::AVX_256bit;
2646   }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2648     if (i == 0 && !is_dst_valid) {
2649       wdst = dst;
2650     }
2651     if (i == 1) {
2652       vextracti128_high(wtmp, wsrc);
2653     } else if (i == 2) {
2654       vextracti64x4_high(wtmp, wsrc);
2655     } else {
2656       assert(i == 0, "%d", i);
2657       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2658     }
2659     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2660     wsrc = wdst;
2661     vlen_enc = Assembler::AVX_128bit;
2662   }
2663   if (is_dst_valid) {
2664     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2665   }
2666 }
2667 
2668 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2669   switch (bt) {
2670     case T_BYTE:  pextrb(dst, src, idx); break;
2671     case T_SHORT: pextrw(dst, src, idx); break;
2672     case T_INT:   pextrd(dst, src, idx); break;
2673     case T_LONG:  pextrq(dst, src, idx); break;
2674 
2675     default:
2676       assert(false,"Should not reach here.");
2677       break;
2678   }
2679 }
2680 
2681 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2682   int esize =  type2aelembytes(typ);
2683   int elem_per_lane = 16/esize;
2684   int lane = elemindex / elem_per_lane;
2685   int eindex = elemindex % elem_per_lane;
2686 
2687   if (lane >= 2) {
2688     assert(UseAVX > 2, "required");
2689     vextractf32x4(dst, src, lane & 3);
2690     return dst;
2691   } else if (lane > 0) {
2692     assert(UseAVX > 0, "required");
2693     vextractf128(dst, src, lane);
2694     return dst;
2695   } else {
2696     return src;
2697   }
2698 }
2699 
2700 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2701   if (typ == T_BYTE) {
2702     movsbl(dst, dst);
2703   } else if (typ == T_SHORT) {
2704     movswl(dst, dst);
2705   }
2706 }
2707 
2708 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2709   int esize =  type2aelembytes(typ);
2710   int elem_per_lane = 16/esize;
2711   int eindex = elemindex % elem_per_lane;
2712   assert(is_integral_type(typ),"required");
2713 
2714   if (eindex == 0) {
2715     if (typ == T_LONG) {
2716       movq(dst, src);
2717     } else {
2718       movdl(dst, src);
2719       movsxl(typ, dst);
2720     }
2721   } else {
2722     extract(typ, dst, src, eindex);
2723     movsxl(typ, dst);
2724   }
2725 }
2726 
2727 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2728   int esize =  type2aelembytes(typ);
2729   int elem_per_lane = 16/esize;
2730   int eindex = elemindex % elem_per_lane;
2731   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2732 
2733   if (eindex == 0) {
2734     movq(dst, src);
2735   } else {
2736     if (typ == T_FLOAT) {
2737       if (UseAVX == 0) {
2738         movdqu(dst, src);
2739         shufps(dst, dst, eindex);
2740       } else {
2741         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2742       }
2743     } else {
2744       if (UseAVX == 0) {
2745         movdqu(dst, src);
2746         psrldq(dst, eindex*esize);
2747       } else {
2748         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2749       }
2750       movq(dst, dst);
2751     }
2752   }
2753   // Zero upper bits
2754   if (typ == T_FLOAT) {
2755     if (UseAVX == 0) {
2756       assert(vtmp != xnoreg, "required.");
2757       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2758       pand(dst, vtmp);
2759     } else {
2760       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2761     }
2762   }
2763 }
2764 
2765 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2766   switch(typ) {
2767     case T_BYTE:
2768     case T_BOOLEAN:
2769       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2770       break;
2771     case T_SHORT:
2772     case T_CHAR:
2773       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2774       break;
2775     case T_INT:
2776     case T_FLOAT:
2777       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2778       break;
2779     case T_LONG:
2780     case T_DOUBLE:
2781       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2782       break;
2783     default:
2784       assert(false,"Should not reach here.");
2785       break;
2786   }
2787 }
2788 
2789 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2790   assert(rscratch != noreg || always_reachable(src2), "missing");
2791 
2792   switch(typ) {
2793     case T_BOOLEAN:
2794     case T_BYTE:
2795       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2796       break;
2797     case T_CHAR:
2798     case T_SHORT:
2799       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2800       break;
2801     case T_INT:
2802     case T_FLOAT:
2803       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2804       break;
2805     case T_LONG:
2806     case T_DOUBLE:
2807       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2808       break;
2809     default:
2810       assert(false,"Should not reach here.");
2811       break;
2812   }
2813 }
2814 
2815 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2816   switch(typ) {
2817     case T_BYTE:
2818       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2819       break;
2820     case T_SHORT:
2821       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2822       break;
2823     case T_INT:
2824     case T_FLOAT:
2825       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2826       break;
2827     case T_LONG:
2828     case T_DOUBLE:
2829       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2830       break;
2831     default:
2832       assert(false,"Should not reach here.");
2833       break;
2834   }
2835 }
2836 
2837 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2838   assert(vlen_in_bytes <= 32, "");
2839   int esize = type2aelembytes(bt);
2840   if (vlen_in_bytes == 32) {
2841     assert(vtmp == xnoreg, "required.");
2842     if (esize >= 4) {
2843       vtestps(src1, src2, AVX_256bit);
2844     } else {
2845       vptest(src1, src2, AVX_256bit);
2846     }
2847     return;
2848   }
2849   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // there is no need to do so for src2.
2852     assert(vtmp != xnoreg, "required");
2853     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2854     pshufd(vtmp, src1, shuffle_imm);
2855   } else {
2856     assert(vtmp == xnoreg, "required");
2857     vtmp = src1;
2858   }
2859   if (esize >= 4 && VM_Version::supports_avx()) {
2860     vtestps(vtmp, src2, AVX_128bit);
2861   } else {
2862     ptest(vtmp, src2);
2863   }
2864 }
2865 
2866 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2867   assert(UseAVX >= 2, "required");
2868 #ifdef ASSERT
2869   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2870   bool is_bw_supported = VM_Version::supports_avx512bw();
2871   if (is_bw && !is_bw_supported) {
2872     assert(vlen_enc != Assembler::AVX_512bit, "required");
2873     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2874            "XMM register should be 0-15");
2875   }
2876 #endif // ASSERT
2877   switch (elem_bt) {
2878     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2879     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2880     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2881     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2882     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2883     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2884     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2885   }
2886 }
2887 
2888 #ifdef _LP64
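// Broadcast a value from a general purpose register to every element of a
// vector. The EVEX evpbroadcast* forms are used when the required AVX-512
// features are present (BW for byte/short elements, VL for vectors shorter
// than 512 bits); otherwise the value is moved into the low element with
// movdl/movdq and broadcast with the AVX2 forms, which are restricted to
// registers XMM0-XMM15.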
2889 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2890   assert(UseAVX >= 2, "required");
2891   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2892   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2893   if ((UseAVX > 2) &&
2894       (!is_bw || VM_Version::supports_avx512bw()) &&
2895       (!is_vl || VM_Version::supports_avx512vl())) {
2896     switch (elem_bt) {
2897       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2898       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2899       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2900       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2901       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2902     }
2903   } else {
2904     assert(vlen_enc != Assembler::AVX_512bit, "required");
2905     assert((dst->encoding() < 16),"XMM register should be 0-15");
2906     switch (elem_bt) {
2907       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2908       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2909       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2910       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2911       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2912       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2913       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2914     }
2915   }
2916 }
2917 #endif
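
// Sign-extending conversion from a byte vector to a wider element type.
// Floating point targets first widen the bytes to ints and then convert;
// for example 8 bytes -> 8 doubles in a 512-bit vector first produces 8 ints
// in a 256-bit vector (mid_vlen_enc) and then converts with vcvtdq2pd.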
2918 
2919 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2920   switch (to_elem_bt) {
2921     case T_SHORT:
2922       vpmovsxbw(dst, src, vlen_enc);
2923       break;
2924     case T_INT:
2925       vpmovsxbd(dst, src, vlen_enc);
2926       break;
2927     case T_FLOAT:
2928       vpmovsxbd(dst, src, vlen_enc);
2929       vcvtdq2ps(dst, dst, vlen_enc);
2930       break;
2931     case T_LONG:
2932       vpmovsxbq(dst, src, vlen_enc);
2933       break;
2934     case T_DOUBLE: {
2935       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2936       vpmovsxbd(dst, src, mid_vlen_enc);
2937       vcvtdq2pd(dst, dst, vlen_enc);
2938       break;
2939     }
2940     default:
2941       fatal("Unsupported type %s", type2name(to_elem_bt));
2942       break;
2943   }
2944 }
2945 
2946 //-------------------------------------------------------------------------------------------
2947 
2948 // IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
2950 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2951                                          Register cnt1, Register cnt2,
2952                                          int int_cnt2,  Register result,
2953                                          XMMRegister vec, Register tmp,
2954                                          int ae) {
2955   ShortBranchVerifier sbv(this);
2956   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2957   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2958 
2959   // This method uses the pcmpestri instruction with bound registers
2960   //   inputs:
2961   //     xmm - substring
2962   //     rax - substring length (elements count)
2963   //     mem - scanned string
2964   //     rdx - string length (elements count)
2965   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2966   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2967   //   outputs:
2968   //     rcx - matched index in string
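  //
  // As a worked example, mode 0xd decodes (per the SSE4.2 imm8 encoding) as:
  //   bits [1:0] = 01 -> unsigned 16-bit elements (8 per 128-bit operand)
  //   bits [3:2] = 11 -> "equal ordered" aggregation, i.e. substring search
  //   bits [6:4] = 0  -> positive polarity, least significant index in rcx
  // Mode 0xc differs only in bits [1:0] = 00 -> unsigned bytes (16 per operand).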
2969   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2970   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2971   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2972   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2973   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2974 
2975   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2976         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2977         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2978 
2979   // Note, inline_string_indexOf() generates checks:
2980   // if (substr.count > string.count) return -1;
2981   // if (substr.count == 0) return 0;
2982   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2983 
2984   // Load substring.
2985   if (ae == StrIntrinsicNode::UL) {
2986     pmovzxbw(vec, Address(str2, 0));
2987   } else {
2988     movdqu(vec, Address(str2, 0));
2989   }
2990   movl(cnt2, int_cnt2);
2991   movptr(result, str1); // string addr
2992 
2993   if (int_cnt2 > stride) {
2994     jmpb(SCAN_TO_SUBSTR);
2995 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2998     bind(RELOAD_SUBSTR);
2999     if (ae == StrIntrinsicNode::UL) {
3000       pmovzxbw(vec, Address(str2, 0));
3001     } else {
3002       movdqu(vec, Address(str2, 0));
3003     }
3004     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
3005 
3006     bind(RELOAD_STR);
3007     // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
3009     // again. Start from the next element after the previous match.
3010 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
3013     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
3014     subl(cnt1, cnt2);
3015     addl(cnt1, int_cnt2);
3016     movl(cnt2, int_cnt2); // Now restore cnt2
3017 
3018     decrementl(cnt1);     // Shift to next element
3019     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3021 
3022     addptr(result, (1<<scale1));
3023 
3024   } // (int_cnt2 > 8)
3025 
3026   // Scan string for start of substr in 16-byte vectors
3027   bind(SCAN_TO_SUBSTR);
3028   pcmpestri(vec, Address(result, 0), mode);
3029   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3030   subl(cnt1, stride);
3031   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3032   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3034   addptr(result, 16);
3035   jmpb(SCAN_TO_SUBSTR);
3036 
3037   // Found a potential substr
3038   bind(FOUND_CANDIDATE);
3039   // Matched whole vector if first element matched (tmp(rcx) == 0).
3040   if (int_cnt2 == stride) {
3041     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
3042   } else { // int_cnt2 > 8
3043     jccb(Assembler::overflow, FOUND_SUBSTR);
3044   }
3045   // After pcmpestri tmp(rcx) contains matched element index
3046   // Compute start addr of substr
3047   lea(result, Address(result, tmp, scale1));
3048 
3049   // Make sure string is still long enough
3050   subl(cnt1, tmp);
3051   cmpl(cnt1, cnt2);
3052   if (int_cnt2 == stride) {
3053     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3054   } else { // int_cnt2 > 8
3055     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
3056   }
  // Left less than substring.
3058 
3059   bind(RET_NOT_FOUND);
3060   movl(result, -1);
3061   jmp(EXIT);
3062 
3063   if (int_cnt2 > stride) {
3064     // This code is optimized for the case when whole substring
3065     // is matched if its head is matched.
3066     bind(MATCH_SUBSTR_HEAD);
3067     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
3069     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
3070 
3071     Label CONT_SCAN_SUBSTR;
3072     // Compare the rest of substring (> 8 chars).
3073     bind(FOUND_SUBSTR);
3074     // First 8 chars are already matched.
3075     negptr(cnt2);
3076     addptr(cnt2, stride);
3077 
3078     bind(SCAN_SUBSTR);
3079     subl(cnt1, stride);
3080     cmpl(cnt2, -stride); // Do not read beyond substring
3081     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
3082     // Back-up strings to avoid reading beyond substring:
3083     // cnt1 = cnt1 - cnt2 + 8
3084     addl(cnt1, cnt2); // cnt2 is negative
3085     addl(cnt1, stride);
3086     movl(cnt2, stride); negptr(cnt2);
3087     bind(CONT_SCAN_SUBSTR);
3088     if (int_cnt2 < (int)G) {
3089       int tail_off1 = int_cnt2<<scale1;
3090       int tail_off2 = int_cnt2<<scale2;
3091       if (ae == StrIntrinsicNode::UL) {
3092         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
3093       } else {
3094         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3095       }
3096       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3097     } else {
3098       // calculate index in register to avoid integer overflow (int_cnt2*2)
3099       movl(tmp, int_cnt2);
3100       addptr(tmp, cnt2);
3101       if (ae == StrIntrinsicNode::UL) {
3102         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3103       } else {
3104         movdqu(vec, Address(str2, tmp, scale2, 0));
3105       }
3106       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3107     }
    // Need to reload string pointers if we did not match the whole vector
3109     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3110     addptr(cnt2, stride);
3111     jcc(Assembler::negative, SCAN_SUBSTR);
3112     // Fall through if found full substring
3113 
3114   } // (int_cnt2 > 8)
3115 
3116   bind(RET_FOUND);
3117   // Found result if we matched full small substring.
3118   // Compute substr offset
3119   subptr(result, str1);
3120   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3121     shrl(result, 1); // index
3122   }
3123   bind(EXIT);
3124 
3125 } // string_indexofC8
3126 
// Small strings are loaded through the stack if they cross a page boundary.
3128 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3129                                        Register cnt1, Register cnt2,
3130                                        int int_cnt2,  Register result,
3131                                        XMMRegister vec, Register tmp,
3132                                        int ae) {
3133   ShortBranchVerifier sbv(this);
3134   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3135   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3136 
3137   //
3138   // int_cnt2 is length of small (< 8 chars) constant substring
3139   // or (-1) for non constant substring in which case its length
3140   // is in cnt2 register.
3141   //
3142   // Note, inline_string_indexOf() generates checks:
3143   // if (substr.count > string.count) return -1;
3144   // if (substr.count == 0) return 0;
3145   //
3146   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3147   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3148   // This method uses the pcmpestri instruction with bound registers
3149   //   inputs:
3150   //     xmm - substring
3151   //     rax - substring length (elements count)
3152   //     mem - scanned string
3153   //     rdx - string length (elements count)
3154   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3155   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3156   //   outputs:
3157   //     rcx - matched index in string
3158   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3159   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3160   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3161   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3162 
3163   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3164         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3165         FOUND_CANDIDATE;
3166 
3167   { //========================================================
3168     // We don't know where these strings are located
3169     // and we can't read beyond them. Load them through stack.
3170     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3171 
3172     movptr(tmp, rsp); // save old SP
3173 
3174     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3175       if (int_cnt2 == (1>>scale2)) { // One byte
3176         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3177         load_unsigned_byte(result, Address(str2, 0));
3178         movdl(vec, result); // move 32 bits
3179       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3180         // Not enough header space in 32-bit VM: 12+3 = 15.
3181         movl(result, Address(str2, -1));
3182         shrl(result, 8);
3183         movdl(vec, result); // move 32 bits
3184       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3185         load_unsigned_short(result, Address(str2, 0));
3186         movdl(vec, result); // move 32 bits
3187       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3188         movdl(vec, Address(str2, 0)); // move 32 bits
3189       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3190         movq(vec, Address(str2, 0));  // move 64 bits
3191       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3192         // Array header size is 12 bytes in 32-bit VM
3193         // + 6 bytes for 3 chars == 18 bytes,
3194         // enough space to load vec and shift.
3195         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3196         if (ae == StrIntrinsicNode::UL) {
3197           int tail_off = int_cnt2-8;
3198           pmovzxbw(vec, Address(str2, tail_off));
3199           psrldq(vec, -2*tail_off);
3200         }
3201         else {
3202           int tail_off = int_cnt2*(1<<scale2);
3203           movdqu(vec, Address(str2, tail_off-16));
3204           psrldq(vec, 16-tail_off);
3205         }
3206       }
3207     } else { // not constant substring
3208       cmpl(cnt2, stride);
3209       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3210 
      // We can read beyond the string if str+16 does not cross a page boundary
3212       // since heaps are aligned and mapped by pages.
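      // For example, with a 4K page a 16-byte load is safe whenever the page
      // offset of the address (addr & 0xfff) is at most page_size - 16 (0xff0).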
3213       assert(os::vm_page_size() < (int)G, "default page should be small");
3214       movl(result, str2); // We need only low 32 bits
3215       andl(result, ((int)os::vm_page_size()-1));
3216       cmpl(result, ((int)os::vm_page_size()-16));
3217       jccb(Assembler::belowEqual, CHECK_STR);
3218 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3220       subptr(rsp, 16);
3221       int stk_offset = wordSize-(1<<scale2);
3222       push(cnt2);
3223 
3224       bind(COPY_SUBSTR);
3225       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3226         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3227         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3228       } else if (ae == StrIntrinsicNode::UU) {
3229         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3230         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3231       }
3232       decrement(cnt2);
3233       jccb(Assembler::notZero, COPY_SUBSTR);
3234 
3235       pop(cnt2);
3236       movptr(str2, rsp);  // New substring address
3237     } // non constant
3238 
3239     bind(CHECK_STR);
3240     cmpl(cnt1, stride);
3241     jccb(Assembler::aboveEqual, BIG_STRINGS);
3242 
3243     // Check cross page boundary.
3244     movl(result, str1); // We need only low 32 bits
3245     andl(result, ((int)os::vm_page_size()-1));
3246     cmpl(result, ((int)os::vm_page_size()-16));
3247     jccb(Assembler::belowEqual, BIG_STRINGS);
3248 
3249     subptr(rsp, 16);
3250     int stk_offset = -(1<<scale1);
3251     if (int_cnt2 < 0) { // not constant
3252       push(cnt2);
3253       stk_offset += wordSize;
3254     }
3255     movl(cnt2, cnt1);
3256 
3257     bind(COPY_STR);
3258     if (ae == StrIntrinsicNode::LL) {
3259       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3260       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3261     } else {
3262       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3263       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3264     }
3265     decrement(cnt2);
3266     jccb(Assembler::notZero, COPY_STR);
3267 
3268     if (int_cnt2 < 0) { // not constant
3269       pop(cnt2);
3270     }
3271     movptr(str1, rsp);  // New string address
3272 
3273     bind(BIG_STRINGS);
3274     // Load substring.
3275     if (int_cnt2 < 0) { // -1
3276       if (ae == StrIntrinsicNode::UL) {
3277         pmovzxbw(vec, Address(str2, 0));
3278       } else {
3279         movdqu(vec, Address(str2, 0));
3280       }
3281       push(cnt2);       // substr count
3282       push(str2);       // substr addr
3283       push(str1);       // string addr
3284     } else {
3285       // Small (< 8 chars) constant substrings are loaded already.
3286       movl(cnt2, int_cnt2);
3287     }
3288     push(tmp);  // original SP
3289 
3290   } // Finished loading
3291 
3292   //========================================================
3293   // Start search
3294   //
3295 
3296   movptr(result, str1); // string addr
3297 
3298   if (int_cnt2  < 0) {  // Only for non constant substring
3299     jmpb(SCAN_TO_SUBSTR);
3300 
3301     // SP saved at sp+0
3302     // String saved at sp+1*wordSize
3303     // Substr saved at sp+2*wordSize
3304     // Substr count saved at sp+3*wordSize
3305 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3308     bind(RELOAD_SUBSTR);
3309     movptr(str2, Address(rsp, 2*wordSize));
3310     movl(cnt2, Address(rsp, 3*wordSize));
3311     if (ae == StrIntrinsicNode::UL) {
3312       pmovzxbw(vec, Address(str2, 0));
3313     } else {
3314       movdqu(vec, Address(str2, 0));
3315     }
3316     // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
3318     // again. Start from the next element after the previous match.
3319     subptr(str1, result); // Restore counter
3320     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3321       shrl(str1, 1);
3322     }
3323     addl(cnt1, str1);
3324     decrementl(cnt1);   // Shift to next element
3325     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3327 
3328     addptr(result, (1<<scale1));
3329   } // non constant
3330 
3331   // Scan string for start of substr in 16-byte vectors
3332   bind(SCAN_TO_SUBSTR);
3333   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3334   pcmpestri(vec, Address(result, 0), mode);
3335   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3336   subl(cnt1, stride);
3337   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3338   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3340   addptr(result, 16);
3341 
3342   bind(ADJUST_STR);
3343   cmpl(cnt1, stride); // Do not read beyond string
3344   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3345   // Back-up string to avoid reading beyond string.
3346   lea(result, Address(result, cnt1, scale1, -16));
3347   movl(cnt1, stride);
3348   jmpb(SCAN_TO_SUBSTR);
3349 
3350   // Found a potential substr
3351   bind(FOUND_CANDIDATE);
3352   // After pcmpestri tmp(rcx) contains matched element index
3353 
3354   // Make sure string is still long enough
3355   subl(cnt1, tmp);
3356   cmpl(cnt1, cnt2);
3357   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3359 
3360   bind(RET_NOT_FOUND);
3361   movl(result, -1);
3362   jmp(CLEANUP);
3363 
3364   bind(FOUND_SUBSTR);
3365   // Compute start addr of substr
3366   lea(result, Address(result, tmp, scale1));
3367   if (int_cnt2 > 0) { // Constant substring
3368     // Repeat search for small substring (< 8 chars)
3369     // from new point without reloading substring.
3370     // Have to check that we don't read beyond string.
3371     cmpl(tmp, stride-int_cnt2);
3372     jccb(Assembler::greater, ADJUST_STR);
3373     // Fall through if matched whole substring.
3374   } else { // non constant
3375     assert(int_cnt2 == -1, "should be != 0");
3376 
3377     addl(tmp, cnt2);
3378     // Found result if we matched whole substring.
3379     cmpl(tmp, stride);
3380     jcc(Assembler::lessEqual, RET_FOUND);
3381 
3382     // Repeat search for small substring (<= 8 chars)
3383     // from new point 'str1' without reloading substring.
3384     cmpl(cnt2, stride);
3385     // Have to check that we don't read beyond string.
3386     jccb(Assembler::lessEqual, ADJUST_STR);
3387 
3388     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3389     // Compare the rest of substring (> 8 chars).
3390     movptr(str1, result);
3391 
3392     cmpl(tmp, cnt2);
3393     // First 8 chars are already matched.
3394     jccb(Assembler::equal, CHECK_NEXT);
3395 
3396     bind(SCAN_SUBSTR);
3397     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
3399     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3400 
3401     bind(CHECK_NEXT);
3402     subl(cnt2, stride);
3403     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3404     addptr(str1, 16);
3405     if (ae == StrIntrinsicNode::UL) {
3406       addptr(str2, 8);
3407     } else {
3408       addptr(str2, 16);
3409     }
3410     subl(cnt1, stride);
3411     cmpl(cnt2, stride); // Do not read beyond substring
3412     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3413     // Back-up strings to avoid reading beyond substring.
3414 
3415     if (ae == StrIntrinsicNode::UL) {
3416       lea(str2, Address(str2, cnt2, scale2, -8));
3417       lea(str1, Address(str1, cnt2, scale1, -16));
3418     } else {
3419       lea(str2, Address(str2, cnt2, scale2, -16));
3420       lea(str1, Address(str1, cnt2, scale1, -16));
3421     }
3422     subl(cnt1, cnt2);
3423     movl(cnt2, stride);
3424     addl(cnt1, stride);
3425     bind(CONT_SCAN_SUBSTR);
3426     if (ae == StrIntrinsicNode::UL) {
3427       pmovzxbw(vec, Address(str2, 0));
3428     } else {
3429       movdqu(vec, Address(str2, 0));
3430     }
3431     jmp(SCAN_SUBSTR);
3432 
3433     bind(RET_FOUND_LONG);
3434     movptr(str1, Address(rsp, wordSize));
3435   } // non constant
3436 
3437   bind(RET_FOUND);
3438   // Compute substr offset
3439   subptr(result, str1);
3440   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3441     shrl(result, 1); // index
3442   }
3443   bind(CLEANUP);
3444   pop(rsp); // restore SP
3445 
3446 } // string_indexof
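
// Find the first occurrence of the (UTF-16) char 'ch' in the 'cnt1' chars
// starting at 'str1'. Scans 16 chars (32 bytes) at a time with AVX2, then
// 8 chars at a time with SSE4.2, then a scalar tail; 'result' receives the
// char index of the match, or -1 if the char is not found.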
3447 
3448 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3449                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3450   ShortBranchVerifier sbv(this);
3451   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3452 
3453   int stride = 8;
3454 
3455   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3456         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3457         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3458         FOUND_SEQ_CHAR, DONE_LABEL;
3459 
3460   movptr(result, str1);
3461   if (UseAVX >= 2) {
3462     cmpl(cnt1, stride);
3463     jcc(Assembler::less, SCAN_TO_CHAR);
3464     cmpl(cnt1, 2*stride);
3465     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3466     movdl(vec1, ch);
3467     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3468     vpxor(vec2, vec2);
3469     movl(tmp, cnt1);
3470     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3471     andl(cnt1,0x0000000F);  //tail count (in chars)
3472 
3473     bind(SCAN_TO_16_CHAR_LOOP);
3474     vmovdqu(vec3, Address(result, 0));
3475     vpcmpeqw(vec3, vec3, vec1, 1);
3476     vptest(vec2, vec3);
3477     jcc(Assembler::carryClear, FOUND_CHAR);
3478     addptr(result, 32);
3479     subl(tmp, 2*stride);
3480     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3481     jmp(SCAN_TO_8_CHAR);
3482     bind(SCAN_TO_8_CHAR_INIT);
3483     movdl(vec1, ch);
3484     pshuflw(vec1, vec1, 0x00);
3485     pshufd(vec1, vec1, 0);
3486     pxor(vec2, vec2);
3487   }
3488   bind(SCAN_TO_8_CHAR);
3489   cmpl(cnt1, stride);
3490   jcc(Assembler::less, SCAN_TO_CHAR);
3491   if (UseAVX < 2) {
3492     movdl(vec1, ch);
3493     pshuflw(vec1, vec1, 0x00);
3494     pshufd(vec1, vec1, 0);
3495     pxor(vec2, vec2);
3496   }
3497   movl(tmp, cnt1);
3498   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3499   andl(cnt1,0x00000007);  //tail count (in chars)
3500 
3501   bind(SCAN_TO_8_CHAR_LOOP);
3502   movdqu(vec3, Address(result, 0));
3503   pcmpeqw(vec3, vec1);
3504   ptest(vec2, vec3);
3505   jcc(Assembler::carryClear, FOUND_CHAR);
3506   addptr(result, 16);
3507   subl(tmp, stride);
3508   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3509   bind(SCAN_TO_CHAR);
3510   testl(cnt1, cnt1);
3511   jcc(Assembler::zero, RET_NOT_FOUND);
3512   bind(SCAN_TO_CHAR_LOOP);
3513   load_unsigned_short(tmp, Address(result, 0));
3514   cmpl(ch, tmp);
3515   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3516   addptr(result, 2);
3517   subl(cnt1, 1);
3518   jccb(Assembler::zero, RET_NOT_FOUND);
3519   jmp(SCAN_TO_CHAR_LOOP);
3520 
3521   bind(RET_NOT_FOUND);
3522   movl(result, -1);
3523   jmpb(DONE_LABEL);
3524 
3525   bind(FOUND_CHAR);
3526   if (UseAVX >= 2) {
3527     vpmovmskb(tmp, vec3);
3528   } else {
3529     pmovmskb(tmp, vec3);
3530   }
3531   bsfl(ch, tmp);
3532   addptr(result, ch);
3533 
3534   bind(FOUND_SEQ_CHAR);
3535   subptr(result, str1);
3536   shrl(result, 1);
3537 
3538   bind(DONE_LABEL);
3539 } // string_indexof_char
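
// Latin-1 variant: find the first occurrence of the byte 'ch' in the 'cnt1'
// bytes starting at 'str1'. Scans 32 bytes at a time with AVX2, then 16 bytes
// with SSE4.2, then a scalar tail; 'result' receives the byte index of the
// match, or -1 if the byte is not found.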
3540 
3541 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3542                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3543   ShortBranchVerifier sbv(this);
3544   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3545 
3546   int stride = 16;
3547 
3548   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3549         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3550         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3551         FOUND_SEQ_CHAR, DONE_LABEL;
3552 
3553   movptr(result, str1);
3554   if (UseAVX >= 2) {
3555     cmpl(cnt1, stride);
3556     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3557     cmpl(cnt1, stride*2);
3558     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3559     movdl(vec1, ch);
3560     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3561     vpxor(vec2, vec2);
3562     movl(tmp, cnt1);
3563     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3564     andl(cnt1,0x0000001F);  //tail count (in chars)
3565 
3566     bind(SCAN_TO_32_CHAR_LOOP);
3567     vmovdqu(vec3, Address(result, 0));
3568     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3569     vptest(vec2, vec3);
3570     jcc(Assembler::carryClear, FOUND_CHAR);
3571     addptr(result, 32);
3572     subl(tmp, stride*2);
3573     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3574     jmp(SCAN_TO_16_CHAR);
3575 
3576     bind(SCAN_TO_16_CHAR_INIT);
3577     movdl(vec1, ch);
3578     pxor(vec2, vec2);
3579     pshufb(vec1, vec2);
3580   }
3581 
3582   bind(SCAN_TO_16_CHAR);
3583   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3585   if (UseAVX < 2) {
3586     movdl(vec1, ch);
3587     pxor(vec2, vec2);
3588     pshufb(vec1, vec2);
3589   }
3590   movl(tmp, cnt1);
3591   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3592   andl(cnt1,0x0000000F);  //tail count (in bytes)
3593 
3594   bind(SCAN_TO_16_CHAR_LOOP);
3595   movdqu(vec3, Address(result, 0));
3596   pcmpeqb(vec3, vec1);
3597   ptest(vec2, vec3);
3598   jcc(Assembler::carryClear, FOUND_CHAR);
3599   addptr(result, 16);
3600   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3602 
3603   bind(SCAN_TO_CHAR_INIT);
3604   testl(cnt1, cnt1);
3605   jcc(Assembler::zero, RET_NOT_FOUND);
3606   bind(SCAN_TO_CHAR_LOOP);
3607   load_unsigned_byte(tmp, Address(result, 0));
3608   cmpl(ch, tmp);
3609   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3610   addptr(result, 1);
3611   subl(cnt1, 1);
3612   jccb(Assembler::zero, RET_NOT_FOUND);
3613   jmp(SCAN_TO_CHAR_LOOP);
3614 
3615   bind(RET_NOT_FOUND);
3616   movl(result, -1);
3617   jmpb(DONE_LABEL);
3618 
3619   bind(FOUND_CHAR);
3620   if (UseAVX >= 2) {
3621     vpmovmskb(tmp, vec3);
3622   } else {
3623     pmovmskb(tmp, vec3);
3624   }
3625   bsfl(ch, tmp);
3626   addptr(result, ch);
3627 
3628   bind(FOUND_SEQ_CHAR);
3629   subptr(result, str1);
3630 
3631   bind(DONE_LABEL);
3632 } // stringL_indexof_char
3633 
3634 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3635   switch (eltype) {
3636   case T_BOOLEAN: return sizeof(jboolean);
3637   case T_BYTE:  return sizeof(jbyte);
3638   case T_SHORT: return sizeof(jshort);
3639   case T_CHAR:  return sizeof(jchar);
3640   case T_INT:   return sizeof(jint);
3641   default:
3642     ShouldNotReachHere();
3643     return -1;
3644   }
3645 }
3646 
3647 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3648   switch (eltype) {
3649   // T_BOOLEAN used as surrogate for unsigned byte
3650   case T_BOOLEAN: movzbl(dst, src);   break;
3651   case T_BYTE:    movsbl(dst, src);   break;
3652   case T_SHORT:   movswl(dst, src);   break;
3653   case T_CHAR:    movzwl(dst, src);   break;
3654   case T_INT:     movl(dst, src);     break;
3655   default:
3656     ShouldNotReachHere();
3657   }
3658 }
3659 
3660 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3661   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3662 }
3663 
3664 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3665   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3666 }
3667 
3668 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3669   const int vlen = Assembler::AVX_256bit;
3670   switch (eltype) {
3671   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3672   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3673   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3674   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3675   case T_INT:
3676     // do nothing
3677     break;
3678   default:
3679     ShouldNotReachHere();
3680   }
3681 }
3682 
3683 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3684                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3685                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3686                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3687                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3688                                         BasicType eltype) {
3689   ShortBranchVerifier sbv(this);
3690   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3691   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3692   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3693 
3694   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3695         SHORT_UNROLLED_LOOP_EXIT,
3696         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3697         UNROLLED_VECTOR_LOOP_BEGIN,
3698         END;
3699   switch (eltype) {
3700   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3701   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3702   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3703   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3704   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3705   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3706   }
3707 
3708   // For "renaming" for readibility of the code
3709   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3710                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3711                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3712 
3713   const int elsize = arrays_hashcode_elsize(eltype);
3714 
3715   /*
3716     if (cnt1 >= 2) {
3717       if (cnt1 >= 32) {
3718         UNROLLED VECTOR LOOP
3719       }
3720       UNROLLED SCALAR LOOP
3721     }
3722     SINGLE SCALAR
3723    */
3724 
3725   cmpl(cnt1, 32);
3726   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3727 
3728   // cnt1 >= 32 && generate_vectorized_loop
3729   xorl(index, index);
3730 
3731   // vresult = IntVector.zero(I256);
3732   for (int idx = 0; idx < 4; idx++) {
3733     vpxor(vresult[idx], vresult[idx]);
3734   }
3735   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3736   Register bound = tmp2;
3737   Register next = tmp3;
3738   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3739   movl(next, Address(tmp2, 0));
3740   movdl(vnext, next);
3741   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3742 
3743   // index = 0;
3744   // bound = cnt1 & ~(32 - 1);
3745   movl(bound, cnt1);
3746   andl(bound, ~(32 - 1));
3747   // for (; index < bound; index += 32) {
3748   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3749   // result *= next;
3750   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; out-of-order
  // execution can then hopefully do a better job of prefetching.
3753   for (int idx = 0; idx < 4; idx++) {
3754     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3755   }
3756   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3757   for (int idx = 0; idx < 4; idx++) {
3758     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3759     arrays_hashcode_elvcast(vtmp[idx], eltype);
3760     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3761   }
3762   // index += 32;
3763   addl(index, 32);
3764   // index < bound;
3765   cmpl(index, bound);
3766   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3767   // }
3768 
3769   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3770   subl(cnt1, bound);
3771   // release bound
3772 
3773   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3774   for (int idx = 0; idx < 4; idx++) {
3775     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3776     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3777     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3778   }
3779   // result += vresult.reduceLanes(ADD);
3780   for (int idx = 0; idx < 4; idx++) {
3781     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3782   }
3783 
3784   // } else if (cnt1 < 32) {
3785 
3786   bind(SHORT_UNROLLED_BEGIN);
3787   // int i = 1;
3788   movl(index, 1);
3789   cmpl(index, cnt1);
3790   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3791 
3792   // for (; i < cnt1 ; i += 2) {
3793   bind(SHORT_UNROLLED_LOOP_BEGIN);
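  // Two elements are folded per iteration:
  //   result = result*31*31 + a[index-1]*31 + a[index]
  // where 961 == 31*31 and x*31 is computed as (x << 5) - x.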
3794   movl(tmp3, 961);
3795   imull(result, tmp3);
3796   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3797   movl(tmp3, tmp2);
3798   shll(tmp3, 5);
3799   subl(tmp3, tmp2);
3800   addl(result, tmp3);
3801   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3802   addl(result, tmp3);
3803   addl(index, 2);
3804   cmpl(index, cnt1);
3805   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3806 
3807   // }
3808   // if (i >= cnt1) {
3809   bind(SHORT_UNROLLED_LOOP_EXIT);
3810   jccb(Assembler::greater, END);
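  // index == cnt1: one trailing element remains, folded as result = result*31 + a[index-1].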
3811   movl(tmp2, result);
3812   shll(result, 5);
3813   subl(result, tmp2);
3814   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3815   addl(result, tmp3);
3816   // }
3817   bind(END);
3818 
3819   BLOCK_COMMENT("} // arrays_hashcode");
3820 
3821 } // arrays_hashcode
3822 
3823 // helper function for string_compare
3824 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3825                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3826                                            Address::ScaleFactor scale2, Register index, int ae) {
3827   if (ae == StrIntrinsicNode::LL) {
3828     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3829     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3830   } else if (ae == StrIntrinsicNode::UU) {
3831     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3832     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3833   } else {
3834     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3835     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3836   }
3837 }
3838 
3839 // Compare strings, used for char[] and byte[].
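// The result follows String.compareTo semantics: the difference of the first
// pair of mismatching elements when there is one, otherwise the difference of
// the string lengths.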
3840 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3841                                        Register cnt1, Register cnt2, Register result,
3842                                        XMMRegister vec1, int ae, KRegister mask) {
3843   ShortBranchVerifier sbv(this);
3844   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3845   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3846   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3847   int stride2x2 = 0x40;
3848   Address::ScaleFactor scale = Address::no_scale;
3849   Address::ScaleFactor scale1 = Address::no_scale;
3850   Address::ScaleFactor scale2 = Address::no_scale;
3851 
3852   if (ae != StrIntrinsicNode::LL) {
3853     stride2x2 = 0x20;
3854   }
3855 
3856   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3857     shrl(cnt2, 1);
3858   }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (pushed on the stack).
  // Use a conditional move to compute the minimum.
3862   movl(result, cnt1);
3863   subl(cnt1, cnt2);
3864   push(cnt1);
3865   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3866 
3867   // Is the minimum length zero?
3868   testl(cnt2, cnt2);
3869   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3870   if (ae == StrIntrinsicNode::LL) {
3871     // Load first bytes
3872     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3873     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3874   } else if (ae == StrIntrinsicNode::UU) {
3875     // Load first characters
3876     load_unsigned_short(result, Address(str1, 0));
3877     load_unsigned_short(cnt1, Address(str2, 0));
3878   } else {
3879     load_unsigned_byte(result, Address(str1, 0));
3880     load_unsigned_short(cnt1, Address(str2, 0));
3881   }
3882   subl(result, cnt1);
3883   jcc(Assembler::notZero,  POP_LABEL);
3884 
3885   if (ae == StrIntrinsicNode::UU) {
3886     // Divide length by 2 to get number of chars
3887     shrl(cnt2, 1);
3888   }
3889   cmpl(cnt2, 1);
3890   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3891 
3892   // Check if the strings start at the same location and setup scale and stride
3893   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3894     cmpptr(str1, str2);
3895     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3896     if (ae == StrIntrinsicNode::LL) {
3897       scale = Address::times_1;
3898       stride = 16;
3899     } else {
3900       scale = Address::times_2;
3901       stride = 8;
3902     }
3903   } else {
3904     scale1 = Address::times_1;
3905     scale2 = Address::times_2;
3906     // scale not used
3907     stride = 8;
3908   }
3909 
3910   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3911     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3912     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3913     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3914     Label COMPARE_TAIL_LONG;
3915     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3916 
3917     int pcmpmask = 0x19;
3918     if (ae == StrIntrinsicNode::LL) {
3919       pcmpmask &= ~0x01;
3920     }
3921 
    // Set up to compare 16-char (32-byte) vectors;
    // start from the first character again because it has an aligned address.
3924     if (ae == StrIntrinsicNode::LL) {
3925       stride2 = 32;
3926     } else {
3927       stride2 = 16;
3928     }
3929     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3930       adr_stride = stride << scale;
3931     } else {
3932       adr_stride1 = 8;  //stride << scale1;
3933       adr_stride2 = 16; //stride << scale2;
3934     }
3935 
3936     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3938     movl(result, cnt2);
3939     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3940     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3941 
3942     // fast path : compare first 2 8-char vectors.
3943     bind(COMPARE_16_CHARS);
3944     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3945       movdqu(vec1, Address(str1, 0));
3946     } else {
3947       pmovzxbw(vec1, Address(str1, 0));
3948     }
3949     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3950     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3951 
3952     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3953       movdqu(vec1, Address(str1, adr_stride));
3954       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3955     } else {
3956       pmovzxbw(vec1, Address(str1, adr_stride1));
3957       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3958     }
3959     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3960     addl(cnt1, stride);
3961 
3962     // Compare the characters at index in cnt1
3963     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3964     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3965     subl(result, cnt2);
3966     jmp(POP_LABEL);
3967 
3968     // Setup the registers to start vector comparison loop
3969     bind(COMPARE_WIDE_VECTORS);
3970     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3971       lea(str1, Address(str1, result, scale));
3972       lea(str2, Address(str2, result, scale));
3973     } else {
3974       lea(str1, Address(str1, result, scale1));
3975       lea(str2, Address(str2, result, scale2));
3976     }
3977     subl(result, stride2);
3978     subl(cnt2, stride2);
3979     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3980     negptr(result);
3981 
    // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3983     bind(COMPARE_WIDE_VECTORS_LOOP);
3984 
3985 #ifdef _LP64
3986     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3987       cmpl(cnt2, stride2x2);
3988       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3989       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3990       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3991 
3992       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3993       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3994         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3995         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3996       } else {
3997         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3998         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3999       }
4000       kortestql(mask, mask);
4001       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
4002       addptr(result, stride2x2);  // update since we already compared at this addr
4003       subl(cnt2, stride2x2);      // and sub the size too
4004       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4005 
4006       vpxor(vec1, vec1);
4007       jmpb(COMPARE_WIDE_TAIL);
4008     }//if (VM_Version::supports_avx512vlbw())
4009 #endif // _LP64
4010 
4011 
4012     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4013     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4014       vmovdqu(vec1, Address(str1, result, scale));
4015       vpxor(vec1, Address(str2, result, scale));
4016     } else {
4017       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
4018       vpxor(vec1, Address(str2, result, scale2));
4019     }
4020     vptest(vec1, vec1);
4021     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
4022     addptr(result, stride2);
4023     subl(cnt2, stride2);
4024     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
4025     // clean upper bits of YMM registers
4026     vpxor(vec1, vec1);
4027 
4028     // compare wide vectors tail
4029     bind(COMPARE_WIDE_TAIL);
4030     testptr(result, result);
4031     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4032 
4033     movl(result, stride2);
4034     movl(cnt2, result);
4035     negptr(result);
4036     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4037 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
4039     bind(VECTOR_NOT_EQUAL);
4040     // clean upper bits of YMM registers
4041     vpxor(vec1, vec1);
4042     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4043       lea(str1, Address(str1, result, scale));
4044       lea(str2, Address(str2, result, scale));
4045     } else {
4046       lea(str1, Address(str1, result, scale1));
4047       lea(str2, Address(str2, result, scale2));
4048     }
4049     jmp(COMPARE_16_CHARS);
4050 
    // Compare tail chars, length between 1 and 15 chars
4052     bind(COMPARE_TAIL_LONG);
4053     movl(cnt2, result);
4054     cmpl(cnt2, stride);
4055     jcc(Assembler::less, COMPARE_SMALL_STR);
4056 
4057     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4058       movdqu(vec1, Address(str1, 0));
4059     } else {
4060       pmovzxbw(vec1, Address(str1, 0));
4061     }
4062     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4063     jcc(Assembler::below, COMPARE_INDEX_CHAR);
4064     subptr(cnt2, stride);
4065     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4066     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4067       lea(str1, Address(str1, result, scale));
4068       lea(str2, Address(str2, result, scale));
4069     } else {
4070       lea(str1, Address(str1, result, scale1));
4071       lea(str2, Address(str2, result, scale2));
4072     }
4073     negptr(cnt2);
4074     jmpb(WHILE_HEAD_LABEL);
4075 
4076     bind(COMPARE_SMALL_STR);
4077   } else if (UseSSE42Intrinsics) {
4078     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
4079     int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors;
    // start from the first character again because it has an aligned address.
4082     movl(result, cnt2);
4083     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
4084     if (ae == StrIntrinsicNode::LL) {
4085       pcmpmask &= ~0x01;
4086     }
4087     jcc(Assembler::zero, COMPARE_TAIL);
4088     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4089       lea(str1, Address(str1, result, scale));
4090       lea(str2, Address(str2, result, scale));
4091     } else {
4092       lea(str1, Address(str1, result, scale1));
4093       lea(str2, Address(str2, result, scale2));
4094     }
4095     negptr(result);
4096 
4097     // pcmpestri
4098     //   inputs:
4099     //     vec1- substring
4100     //     rax - negative string length (elements count)
4101     //     mem - scanned string
4102     //     rdx - string length (elements count)
4103     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4104     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4105     //   outputs:
4106     //     rcx - first mismatched element index
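         //   Note (illustrative): in this mode CF is set when the current block
         //   contains a mismatch (hence the 'below', i.e. CF==1, branches below),
         //   and rcx then holds the index of the first mismatching element.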
4107     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4108 
4109     bind(COMPARE_WIDE_VECTORS);
4110     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4111       movdqu(vec1, Address(str1, result, scale));
4112       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4113     } else {
4114       pmovzxbw(vec1, Address(str1, result, scale1));
4115       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4116     }
4117     // After pcmpestri cnt1(rcx) contains mismatched element index
4118 
4119     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4120     addptr(result, stride);
4121     subptr(cnt2, stride);
4122     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4123 
4124     // compare wide vectors tail
4125     testptr(result, result);
4126     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4127 
4128     movl(cnt2, stride);
4129     movl(result, stride);
4130     negptr(result);
4131     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4132       movdqu(vec1, Address(str1, result, scale));
4133       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4134     } else {
4135       pmovzxbw(vec1, Address(str1, result, scale1));
4136       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4137     }
4138     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4139 
4140     // Mismatched characters in the vectors
4141     bind(VECTOR_NOT_EQUAL);
4142     addptr(cnt1, result);
4143     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4144     subl(result, cnt2);
4145     jmpb(POP_LABEL);
4146 
4147     bind(COMPARE_TAIL); // limit is zero
4148     movl(cnt2, result);
4149     // Fallthru to tail compare
4150   }
4151   // Shift str2 and str1 to the end of the arrays, negate min
4152   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4153     lea(str1, Address(str1, cnt2, scale));
4154     lea(str2, Address(str2, cnt2, scale));
4155   } else {
4156     lea(str1, Address(str1, cnt2, scale1));
4157     lea(str2, Address(str2, cnt2, scale2));
4158   }
4159   decrementl(cnt2);  // first character was compared already
4160   negptr(cnt2);
4161 
4162   // Compare the rest of the elements
4163   bind(WHILE_HEAD_LABEL);
4164   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4165   subl(result, cnt1);
4166   jccb(Assembler::notZero, POP_LABEL);
4167   increment(cnt2);
4168   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4169 
4170   // Strings are equal up to min length.  Return the length difference.
4171   bind(LENGTH_DIFF_LABEL);
4172   pop(result);
4173   if (ae == StrIntrinsicNode::UU) {
4174     // Divide diff by 2 to get number of chars
4175     sarl(result, 1);
4176   }
4177   jmpb(DONE_LABEL);
4178 
4179 #ifdef _LP64
4180   if (VM_Version::supports_avx512vlbw()) {
4181 
4182     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4183 
4184     kmovql(cnt1, mask);
4185     notq(cnt1);
4186     bsfq(cnt2, cnt1);
4187     if (ae != StrIntrinsicNode::LL) {
4188       // Divide diff by 2 to get number of chars
4189       sarl(cnt2, 1);
4190     }
4191     addq(result, cnt2);
4192     if (ae == StrIntrinsicNode::LL) {
4193       load_unsigned_byte(cnt1, Address(str2, result));
4194       load_unsigned_byte(result, Address(str1, result));
4195     } else if (ae == StrIntrinsicNode::UU) {
4196       load_unsigned_short(cnt1, Address(str2, result, scale));
4197       load_unsigned_short(result, Address(str1, result, scale));
4198     } else {
4199       load_unsigned_short(cnt1, Address(str2, result, scale2));
4200       load_unsigned_byte(result, Address(str1, result, scale1));
4201     }
4202     subl(result, cnt1);
4203     jmpb(POP_LABEL);
4204   }//if (VM_Version::supports_avx512vlbw())
4205 #endif // _LP64
4206 
4207   // Discard the stored length difference
4208   bind(POP_LABEL);
4209   pop(cnt1);
4210 
4211   // That's it
4212   bind(DONE_LABEL);
4213   if (ae == StrIntrinsicNode::UL) {
4214     negl(result);
4215   }
4216 
4217 }
4218 
4219 // Search for Non-ASCII character (Negative byte value) in a byte array,
4220 // return the index of the first such character, otherwise the length
4221 // of the array segment searched.
4222 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4223 //   @IntrinsicCandidate
4224 //   public static int countPositives(byte[] ba, int off, int len) {
4225 //     for (int i = off; i < off + len; i++) {
4226 //       if (ba[i] < 0) {
4227 //         return i - off;
4228 //       }
4229 //     }
4230 //     return len;
4231 //   }
4232 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4233   Register result, Register tmp1,
4234   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4235   // rsi: byte array
4236   // rcx: len
4237   // rax: result
4238   ShortBranchVerifier sbv(this);
4239   assert_different_registers(ary1, len, result, tmp1);
4240   assert_different_registers(vec1, vec2);
4241   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4242 
4243   movl(result, len); // copy
4244   // len == 0
4245   testl(len, len);
4246   jcc(Assembler::zero, DONE);
4247 
4248   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4249     VM_Version::supports_avx512vlbw() &&
4250     VM_Version::supports_bmi2()) {
4251 
4252     Label test_64_loop, test_tail, BREAK_LOOP;
4253     movl(tmp1, len);
4254     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4255 
4256     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4257     andl(len,  0xffffffc0); // vector count (in chars)
4258     jccb(Assembler::zero, test_tail);
4259 
4260     lea(ary1, Address(ary1, len, Address::times_1));
4261     negptr(len);
4262 
4263     bind(test_64_loop);
4264     // Check whether our 64 elements of size byte contain negatives
4265     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4266     kortestql(mask1, mask1);
4267     jcc(Assembler::notZero, BREAK_LOOP);
4268 
4269     addptr(len, 64);
4270     jccb(Assembler::notZero, test_64_loop);
4271 
4272     bind(test_tail);
4273     // bail out when there is nothing to be done
4274     testl(tmp1, -1);
4275     jcc(Assembler::zero, DONE);
4276 
4277 
4278     // check the tail for absence of negatives
4279     // ~(~0 << len) applied up to two times (for the 32-bit scenario)
4280 #ifdef _LP64
4281     {
4282       Register tmp3_aliased = len;
4283       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4284       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4285       notq(tmp3_aliased);
4286       kmovql(mask2, tmp3_aliased);
4287     }
4288 #else
4289     Label k_init;
4290     jmp(k_init);
4291 
4292     // We cannot read 64 bits from a general purpose register in 32-bit mode, so we
4293     // place the data required to compose the 64 1's into the instruction stream:
4294     // a 64-byte series of elements 0..63, which is later used as the compare
4295     // target against the tail count held in the tmp1 register.
4296     // The result is a k register with tmp1 consecutive 1's, counting from the
4297     // least significant bit.
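         // Illustrative example: if tmp1 == 3, evpbroadcastb fills vec1 with 3 and the
         // compare-greater against the 0..63 byte sequence below sets exactly the mask
         // bits for indices 0, 1 and 2, i.e. mask2 == 0b...0111.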
4298     address tmp = pc();
4299     emit_int64(0x0706050403020100);
4300     emit_int64(0x0F0E0D0C0B0A0908);
4301     emit_int64(0x1716151413121110);
4302     emit_int64(0x1F1E1D1C1B1A1918);
4303     emit_int64(0x2726252423222120);
4304     emit_int64(0x2F2E2D2C2B2A2928);
4305     emit_int64(0x3736353433323130);
4306     emit_int64(0x3F3E3D3C3B3A3938);
4307 
4308     bind(k_init);
4309     lea(len, InternalAddress(tmp));
4310     // create mask to test for negative byte inside a vector
4311     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4312     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4313 
4314 #endif
4315     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4316     ktestq(mask1, mask2);
4317     jcc(Assembler::zero, DONE);
4318 
4319     // do a full check for negative bytes in the tail
4320     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4321                      // ary1 already pointing to the right place
4322     jmpb(TAIL_START);
4323 
4324     bind(BREAK_LOOP);
4325     // At least one byte in the last 64 byte block was negative.
4326     // Set up to look at the last 64 bytes as if they were a tail
4327     lea(ary1, Address(ary1, len, Address::times_1));
4328     addptr(result, len);
4329     // Ignore the very last byte: if all others are positive,
4330     // it must be negative, so we can skip right to the 2+1 byte
4331     // end comparison at this point
4332     orl(result, 63);
4333     movl(len, 63);
4334     // Fallthru to tail compare
4335   } else {
4336 
4337     if (UseAVX >= 2 && UseSSE >= 2) {
4338       // With AVX2, use 32-byte vector compare
4339       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4340 
4341       // Compare 32-byte vectors
4342       testl(len, 0xffffffe0);   // vector count (in bytes)
4343       jccb(Assembler::zero, TAIL_START);
4344 
4345       andl(len, 0xffffffe0);
4346       lea(ary1, Address(ary1, len, Address::times_1));
4347       negptr(len);
4348 
4349       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4350       movdl(vec2, tmp1);
4351       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4352 
4353       bind(COMPARE_WIDE_VECTORS);
4354       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4355       vptest(vec1, vec2);
4356       jccb(Assembler::notZero, BREAK_LOOP);
4357       addptr(len, 32);
4358       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4359 
4360       testl(result, 0x0000001f);   // any bytes remaining?
4361       jcc(Assembler::zero, DONE);
4362 
4363       // Quick test using the already prepared vector mask
4364       movl(len, result);
4365       andl(len, 0x0000001f);
4366       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4367       vptest(vec1, vec2);
4368       jcc(Assembler::zero, DONE);
4369       // There are zeros, jump to the tail to determine exactly where
4370       jmpb(TAIL_START);
4371 
4372       bind(BREAK_LOOP);
4373       // At least one byte in the last 32-byte vector is negative.
4374       // Set up to look at the last 32 bytes as if they were a tail
4375       lea(ary1, Address(ary1, len, Address::times_1));
4376       addptr(result, len);
4377       // Ignore the very last byte: if all others are positive,
4378       // it must be negative, so we can skip right to the 2+1 byte
4379       // end comparison at this point
4380       orl(result, 31);
4381       movl(len, 31);
4382       // Fallthru to tail compare
4383     } else if (UseSSE42Intrinsics) {
4384       // With SSE4.2, use double quad vector compare
4385       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4386 
4387       // Compare 16-byte vectors
4388       testl(len, 0xfffffff0);   // vector count (in bytes)
4389       jcc(Assembler::zero, TAIL_START);
4390 
4391       andl(len, 0xfffffff0);
4392       lea(ary1, Address(ary1, len, Address::times_1));
4393       negptr(len);
4394 
4395       movl(tmp1, 0x80808080);
4396       movdl(vec2, tmp1);
4397       pshufd(vec2, vec2, 0);
4398 
4399       bind(COMPARE_WIDE_VECTORS);
4400       movdqu(vec1, Address(ary1, len, Address::times_1));
4401       ptest(vec1, vec2);
4402       jccb(Assembler::notZero, BREAK_LOOP);
4403       addptr(len, 16);
4404       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4405 
4406       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4407       jcc(Assembler::zero, DONE);
4408 
4409       // Quick test using the already prepared vector mask
4410       movl(len, result);
4411       andl(len, 0x0000000f);   // tail count (in bytes)
4412       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4413       ptest(vec1, vec2);
4414       jcc(Assembler::zero, DONE);
4415       jmpb(TAIL_START);
4416 
4417       bind(BREAK_LOOP);
4418       // At least one byte in the last 16-byte vector is negative.
4419       // Set up and look at the last 16 bytes as if they were a tail
4420       lea(ary1, Address(ary1, len, Address::times_1));
4421       addptr(result, len);
4422       // Ignore the very last byte: if all others are positive,
4423       // it must be negative, so we can skip right to the 2+1 byte
4424       // end comparison at this point
4425       orl(result, 15);
4426       movl(len, 15);
4427       // Fallthru to tail compare
4428     }
4429   }
4430 
4431   bind(TAIL_START);
4432   // Compare 4-byte vectors
4433   andl(len, 0xfffffffc); // vector count (in bytes)
4434   jccb(Assembler::zero, COMPARE_CHAR);
4435 
4436   lea(ary1, Address(ary1, len, Address::times_1));
4437   negptr(len);
4438 
4439   bind(COMPARE_VECTORS);
4440   movl(tmp1, Address(ary1, len, Address::times_1));
4441   andl(tmp1, 0x80808080);
4442   jccb(Assembler::notZero, TAIL_ADJUST);
4443   addptr(len, 4);
4444   jccb(Assembler::notZero, COMPARE_VECTORS);
4445 
4446   // Compare trailing char (final 2-3 bytes), if any
4447   bind(COMPARE_CHAR);
4448 
4449   testl(result, 0x2);   // tail  char
4450   jccb(Assembler::zero, COMPARE_BYTE);
4451   load_unsigned_short(tmp1, Address(ary1, 0));
4452   andl(tmp1, 0x00008080);
4453   jccb(Assembler::notZero, CHAR_ADJUST);
4454   lea(ary1, Address(ary1, 2));
4455 
4456   bind(COMPARE_BYTE);
4457   testl(result, 0x1);   // tail  byte
4458   jccb(Assembler::zero, DONE);
4459   load_unsigned_byte(tmp1, Address(ary1, 0));
4460   testl(tmp1, 0x00000080);
4461   jccb(Assembler::zero, DONE);
4462   subptr(result, 1);
4463   jmpb(DONE);
4464 
4465   bind(TAIL_ADJUST);
4466   // There are negative bytes in the last 4-byte block.
4467   // Adjust result and check the next three bytes.
4468   addptr(result, len);
4469   orl(result, 3);
4470   lea(ary1, Address(ary1, len, Address::times_1));
4471   jmpb(COMPARE_CHAR);
4472 
4473   bind(CHAR_ADJUST);
4474   // We are looking at a char + optional byte tail, and found that one
4475   // of the bytes in the char is negative. Adjust the result, check the
4476   // first byte and readjust if needed.
4477   andl(result, 0xfffffffc);
4478   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4479   jccb(Assembler::notZero, DONE);
4480   addptr(result, 1);
4481 
4482   // That's it
4483   bind(DONE);
4484   if (UseAVX >= 2 && UseSSE >= 2) {
4485     // clean upper bits of YMM registers
4486     vpxor(vec1, vec1);
4487     vpxor(vec2, vec2);
4488   }
4489 }
4490 
4491 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4492 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4493                                       Register limit, Register result, Register chr,
4494                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4495   ShortBranchVerifier sbv(this);
4496   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4497 
4498   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4499   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4500 
4501   if (is_array_equ) {
4502     // Check the input args
4503     cmpoop(ary1, ary2);
4504     jcc(Assembler::equal, TRUE_LABEL);
4505 
4506     // Need additional checks for arrays_equals.
4507     testptr(ary1, ary1);
4508     jcc(Assembler::zero, FALSE_LABEL);
4509     testptr(ary2, ary2);
4510     jcc(Assembler::zero, FALSE_LABEL);
4511 
4512     // Check the lengths
4513     movl(limit, Address(ary1, length_offset));
4514     cmpl(limit, Address(ary2, length_offset));
4515     jcc(Assembler::notEqual, FALSE_LABEL);
4516   }
4517 
4518   // count == 0
4519   testl(limit, limit);
4520   jcc(Assembler::zero, TRUE_LABEL);
4521 
4522   if (is_array_equ) {
4523     // Load array address
4524     lea(ary1, Address(ary1, base_offset));
4525     lea(ary2, Address(ary2, base_offset));
4526   }
4527 
4528   if (is_array_equ && is_char) {
4529     // arrays_equals when used for char[].
4530     shll(limit, 1);      // convert char count to byte count (still != 0)
4531   }
4532   movl(result, limit); // copy
4533 
4534   if (UseAVX >= 2) {
4535     // With AVX2, use 32-byte vector compare
4536     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4537 
4538     // Compare 32-byte vectors
4539     andl(result, 0x0000001f);  //   tail count (in bytes)
4540     andl(limit, 0xffffffe0);   // vector count (in bytes)
4541     jcc(Assembler::zero, COMPARE_TAIL);
4542 
4543     lea(ary1, Address(ary1, limit, Address::times_1));
4544     lea(ary2, Address(ary2, limit, Address::times_1));
4545     negptr(limit);
4546 
4547 #ifdef _LP64
4548     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4549       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4550 
4551       cmpl(limit, -64);
4552       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4553 
4554       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4555 
4556       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4557       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4558       kortestql(mask, mask);
4559       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4560       addptr(limit, 64);  // update since we already compared at this addr
4561       cmpl(limit, -64);
4562       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4563 
4564       // At this point we may still need to compare -limit+result bytes.
4565       // We could execute the next two instructions and just continue via the non-wide path:
4566       //  cmpl(limit, 0);
4567       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4568       // But since we stopped at the points ary{1,2}+limit which are
4569       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4570       // (|limit| <= 32 and result < 32),
4571       // we may just compare the last 64 bytes.
4572       //
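           // For example, if the loop exits with limit == -32 and the tail count in
           // result is 20, then 52 bytes remain; reloading the final 64 bytes covers
           // them and merely re-checks 12 bytes that already compared equal.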
4573       addptr(result, -64);   // this is safe because we just compared this area
4574       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4575       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4576       kortestql(mask, mask);
4577       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4578 
4579       jmp(TRUE_LABEL);
4580 
4581       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4582 
4583     }//if (VM_Version::supports_avx512vlbw())
4584 #endif //_LP64
4585     bind(COMPARE_WIDE_VECTORS);
4586     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4587     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4588     vpxor(vec1, vec2);
4589 
4590     vptest(vec1, vec1);
4591     jcc(Assembler::notZero, FALSE_LABEL);
4592     addptr(limit, 32);
4593     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4594 
4595     testl(result, result);
4596     jcc(Assembler::zero, TRUE_LABEL);
4597 
4598     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4599     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4600     vpxor(vec1, vec2);
4601 
4602     vptest(vec1, vec1);
4603     jccb(Assembler::notZero, FALSE_LABEL);
4604     jmpb(TRUE_LABEL);
4605 
4606     bind(COMPARE_TAIL); // limit is zero
4607     movl(limit, result);
4608     // Fallthru to tail compare
4609   } else if (UseSSE42Intrinsics) {
4610     // With SSE4.2, use double quad vector compare
4611     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4612 
4613     // Compare 16-byte vectors
4614     andl(result, 0x0000000f);  //   tail count (in bytes)
4615     andl(limit, 0xfffffff0);   // vector count (in bytes)
4616     jcc(Assembler::zero, COMPARE_TAIL);
4617 
4618     lea(ary1, Address(ary1, limit, Address::times_1));
4619     lea(ary2, Address(ary2, limit, Address::times_1));
4620     negptr(limit);
4621 
4622     bind(COMPARE_WIDE_VECTORS);
4623     movdqu(vec1, Address(ary1, limit, Address::times_1));
4624     movdqu(vec2, Address(ary2, limit, Address::times_1));
4625     pxor(vec1, vec2);
4626 
4627     ptest(vec1, vec1);
4628     jcc(Assembler::notZero, FALSE_LABEL);
4629     addptr(limit, 16);
4630     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4631 
4632     testl(result, result);
4633     jcc(Assembler::zero, TRUE_LABEL);
4634 
4635     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4636     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4637     pxor(vec1, vec2);
4638 
4639     ptest(vec1, vec1);
4640     jccb(Assembler::notZero, FALSE_LABEL);
4641     jmpb(TRUE_LABEL);
4642 
4643     bind(COMPARE_TAIL); // limit is zero
4644     movl(limit, result);
4645     // Fallthru to tail compare
4646   }
4647 
4648   // Compare 4-byte vectors
4649   andl(limit, 0xfffffffc); // vector count (in bytes)
4650   jccb(Assembler::zero, COMPARE_CHAR);
4651 
4652   lea(ary1, Address(ary1, limit, Address::times_1));
4653   lea(ary2, Address(ary2, limit, Address::times_1));
4654   negptr(limit);
4655 
4656   bind(COMPARE_VECTORS);
4657   movl(chr, Address(ary1, limit, Address::times_1));
4658   cmpl(chr, Address(ary2, limit, Address::times_1));
4659   jccb(Assembler::notEqual, FALSE_LABEL);
4660   addptr(limit, 4);
4661   jcc(Assembler::notZero, COMPARE_VECTORS);
4662 
4663   // Compare trailing char (final 2 bytes), if any
4664   bind(COMPARE_CHAR);
4665   testl(result, 0x2);   // tail  char
4666   jccb(Assembler::zero, COMPARE_BYTE);
4667   load_unsigned_short(chr, Address(ary1, 0));
4668   load_unsigned_short(limit, Address(ary2, 0));
4669   cmpl(chr, limit);
4670   jccb(Assembler::notEqual, FALSE_LABEL);
4671 
4672   if (is_array_equ && is_char) {
4673     bind(COMPARE_BYTE);
4674   } else {
4675     lea(ary1, Address(ary1, 2));
4676     lea(ary2, Address(ary2, 2));
4677 
4678     bind(COMPARE_BYTE);
4679     testl(result, 0x1);   // tail  byte
4680     jccb(Assembler::zero, TRUE_LABEL);
4681     load_unsigned_byte(chr, Address(ary1, 0));
4682     load_unsigned_byte(limit, Address(ary2, 0));
4683     cmpl(chr, limit);
4684     jccb(Assembler::notEqual, FALSE_LABEL);
4685   }
4686   bind(TRUE_LABEL);
4687   movl(result, 1);   // return true
4688   jmpb(DONE);
4689 
4690   bind(FALSE_LABEL);
4691   xorl(result, result); // return false
4692 
4693   // That's it
4694   bind(DONE);
4695   if (UseAVX >= 2) {
4696     // clean upper bits of YMM registers
4697     vpxor(vec1, vec1);
4698     vpxor(vec2, vec2);
4699   }
4700 }
4701 
4702 #ifdef _LP64
4703 
4704 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4705 #define __ masm.
4706   Register dst = stub.data<0>();
4707   XMMRegister src = stub.data<1>();
4708   address target = stub.data<2>();
4709   __ bind(stub.entry());
4710   __ subptr(rsp, 8);
4711   __ movdbl(Address(rsp), src);
4712   __ call(RuntimeAddress(target));
4713   __ pop(dst);
4714   __ jmp(stub.continuation());
4715 #undef __
4716 }
4717 
4718 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4719   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4720   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4721 
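       // Note: cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000
       // for int, 0x8000000000000000 for long) when the source is NaN or out of range.
       // The compares below detect that value and route such inputs to the fixup stub,
       // which fixes up the result according to Java semantics.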
4722   address slowpath_target;
4723   if (dst_bt == T_INT) {
4724     if (src_bt == T_FLOAT) {
4725       cvttss2sil(dst, src);
4726       cmpl(dst, 0x80000000);
4727       slowpath_target = StubRoutines::x86::f2i_fixup();
4728     } else {
4729       cvttsd2sil(dst, src);
4730       cmpl(dst, 0x80000000);
4731       slowpath_target = StubRoutines::x86::d2i_fixup();
4732     }
4733   } else {
4734     if (src_bt == T_FLOAT) {
4735       cvttss2siq(dst, src);
4736       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4737       slowpath_target = StubRoutines::x86::f2l_fixup();
4738     } else {
4739       cvttsd2siq(dst, src);
4740       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4741       slowpath_target = StubRoutines::x86::d2l_fixup();
4742     }
4743   }
4744 
4745   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4746   jcc(Assembler::equal, stub->entry());
4747   bind(stub->continuation());
4748 }
4749 
4750 #endif // _LP64
4751 
4752 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4753                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4754   switch(ideal_opc) {
4755     case Op_LShiftVS:
4756       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4757     case Op_LShiftVI:
4758       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4759     case Op_LShiftVL:
4760       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4761     case Op_RShiftVS:
4762       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4763     case Op_RShiftVI:
4764       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4765     case Op_RShiftVL:
4766       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4767     case Op_URShiftVS:
4768       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4769     case Op_URShiftVI:
4770       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4771     case Op_URShiftVL:
4772       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4773     case Op_RotateRightV:
4774       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4775     case Op_RotateLeftV:
4776       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4777     default:
4778       fatal("Unsupported masked operation"); break;
4779   }
4780 }
4781 
4782 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4783                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4784                                     bool is_varshift) {
4785   switch (ideal_opc) {
4786     case Op_AddVB:
4787       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_AddVS:
4789       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_AddVI:
4791       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4792     case Op_AddVL:
4793       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4794     case Op_AddVF:
4795       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4796     case Op_AddVD:
4797       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4798     case Op_SubVB:
4799       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4800     case Op_SubVS:
4801       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4802     case Op_SubVI:
4803       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4804     case Op_SubVL:
4805       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4806     case Op_SubVF:
4807       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4808     case Op_SubVD:
4809       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4810     case Op_MulVS:
4811       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4812     case Op_MulVI:
4813       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4814     case Op_MulVL:
4815       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4816     case Op_MulVF:
4817       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4818     case Op_MulVD:
4819       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4820     case Op_DivVF:
4821       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4822     case Op_DivVD:
4823       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4824     case Op_SqrtVF:
4825       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4826     case Op_SqrtVD:
4827       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4828     case Op_AbsVB:
4829       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4830     case Op_AbsVS:
4831       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4832     case Op_AbsVI:
4833       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4834     case Op_AbsVL:
4835       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4836     case Op_FmaVF:
4837       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4838     case Op_FmaVD:
4839       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4840     case Op_VectorRearrange:
4841       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4842     case Op_LShiftVS:
4843       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4844     case Op_LShiftVI:
4845       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4846     case Op_LShiftVL:
4847       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4848     case Op_RShiftVS:
4849       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4850     case Op_RShiftVI:
4851       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4852     case Op_RShiftVL:
4853       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4854     case Op_URShiftVS:
4855       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4856     case Op_URShiftVI:
4857       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4858     case Op_URShiftVL:
4859       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4860     case Op_RotateLeftV:
4861       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4862     case Op_RotateRightV:
4863       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4864     case Op_MaxV:
4865       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4866     case Op_MinV:
4867       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4868     case Op_XorV:
4869       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4870     case Op_OrV:
4871       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4872     case Op_AndV:
4873       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4874     default:
4875       fatal("Unsupported masked operation"); break;
4876   }
4877 }
4878 
4879 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4880                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4881   switch (ideal_opc) {
4882     case Op_AddVB:
4883       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4884     case Op_AddVS:
4885       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4886     case Op_AddVI:
4887       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4888     case Op_AddVL:
4889       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4890     case Op_AddVF:
4891       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4892     case Op_AddVD:
4893       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4894     case Op_SubVB:
4895       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4896     case Op_SubVS:
4897       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4898     case Op_SubVI:
4899       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4900     case Op_SubVL:
4901       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4902     case Op_SubVF:
4903       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4904     case Op_SubVD:
4905       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4906     case Op_MulVS:
4907       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4908     case Op_MulVI:
4909       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4910     case Op_MulVL:
4911       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4912     case Op_MulVF:
4913       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4914     case Op_MulVD:
4915       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4916     case Op_DivVF:
4917       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4918     case Op_DivVD:
4919       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4920     case Op_FmaVF:
4921       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4922     case Op_FmaVD:
4923       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4924     case Op_MaxV:
4925       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4926     case Op_MinV:
4927       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4928     case Op_XorV:
4929       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4930     case Op_OrV:
4931       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4932     case Op_AndV:
4933       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4934     default:
4935       fatal("Unsupported masked operation"); break;
4936   }
4937 }
4938 
4939 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4940                                   KRegister src1, KRegister src2) {
4941   BasicType etype = T_ILLEGAL;
4942   switch(mask_len) {
4943     case 2:
4944     case 4:
4945     case 8:  etype = T_BYTE; break;
4946     case 16: etype = T_SHORT; break;
4947     case 32: etype = T_INT; break;
4948     case 64: etype = T_LONG; break;
4949     default: fatal("Unsupported type"); break;
4950   }
4951   assert(etype != T_ILLEGAL, "");
4952   switch(ideal_opc) {
4953     case Op_AndVMask:
4954       kand(etype, dst, src1, src2); break;
4955     case Op_OrVMask:
4956       kor(etype, dst, src1, src2); break;
4957     case Op_XorVMask:
4958       kxor(etype, dst, src1, src2); break;
4959     default:
4960       fatal("Unsupported masked operation"); break;
4961   }
4962 }
4963 
4964 /*
4965  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4966  * If src is NaN, the result is 0.
4967  * If src is negative infinity or any value less than or equal to Integer.MIN_VALUE,
4968  * the result is Integer.MIN_VALUE.
4969  * If src is positive infinity or any value greater than or equal to Integer.MAX_VALUE,
4970  * the result is Integer.MAX_VALUE.
4971  */
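     /*
      * For example (scalar view of the same rule): (int)Float.NaN == 0,
      * (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE, (int)Float.POSITIVE_INFINITY == Integer.MAX_VALUE,
      * (int)2.0e10f == Integer.MAX_VALUE, while ordinary values such as (int)-3.9f == -3 truncate toward zero.
      */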
4972 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4973                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4974                                                                    Register rscratch, AddressLiteral float_sign_flip,
4975                                                                    int vec_enc) {
4976   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4977   Label done;
4978   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4979   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4980   vptest(xtmp2, xtmp2, vec_enc);
4981   jccb(Assembler::equal, done);
4982 
4983   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4984   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4985 
4986   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4987   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4988   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4989 
4990   // Recompute the mask for the remaining special values.
4991   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4992   // Extract SRC values corresponding to TRUE mask lanes.
4993   vpand(xtmp4, xtmp2, src, vec_enc);
4994   // Flip the mask bits so that the MSB of mask lanes corresponding to +ve special
4995   // values is set.
4996   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4997 
4998   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4999   bind(done);
5000 }
5001 
5002 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5003                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5004                                                                     Register rscratch, AddressLiteral float_sign_flip,
5005                                                                     int vec_enc) {
5006   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5007   Label done;
5008   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5009   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5010   kortestwl(ktmp1, ktmp1);
5011   jccb(Assembler::equal, done);
5012 
5013   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5014   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5015   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5016 
5017   kxorwl(ktmp1, ktmp1, ktmp2);
5018   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5019   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5020   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5021   bind(done);
5022 }
5023 
5024 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5025                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5026                                                                      Register rscratch, AddressLiteral double_sign_flip,
5027                                                                      int vec_enc) {
5028   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5029 
5030   Label done;
5031   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5032   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5033   kortestwl(ktmp1, ktmp1);
5034   jccb(Assembler::equal, done);
5035 
5036   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5037   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5038   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5039 
5040   kxorwl(ktmp1, ktmp1, ktmp2);
5041   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5042   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5043   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5044   bind(done);
5045 }
5046 
5047 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5048                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5049                                                                      Register rscratch, AddressLiteral float_sign_flip,
5050                                                                      int vec_enc) {
5051   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5052   Label done;
5053   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5054   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5055   kortestwl(ktmp1, ktmp1);
5056   jccb(Assembler::equal, done);
5057 
5058   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5059   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5060   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5061 
5062   kxorwl(ktmp1, ktmp1, ktmp2);
5063   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5064   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5065   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5066   bind(done);
5067 }
5068 
5069 /*
5070  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5071  * If src is NaN, the result is 0.
5072  * If src is negative infinity or any value less than or equal to Long.MIN_VALUE,
5073  * the result is Long.MIN_VALUE.
5074  * If src is positive infinity or any value greater than or equal to Long.MAX_VALUE,
5075  * the result is Long.MAX_VALUE.
5076  */
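     /*
      * For example (scalar view of the same rule): (long)Double.NaN == 0L,
      * (long)Double.NEGATIVE_INFINITY == Long.MIN_VALUE,
      * (long)Double.POSITIVE_INFINITY == Long.MAX_VALUE, and (long)1.0e300 == Long.MAX_VALUE.
      */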
5077 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5078                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5079                                                                       Register rscratch, AddressLiteral double_sign_flip,
5080                                                                       int vec_enc) {
5081   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5082 
5083   Label done;
5084   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5085   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5086   kortestwl(ktmp1, ktmp1);
5087   jccb(Assembler::equal, done);
5088 
5089   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5090   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5091   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5092 
5093   kxorwl(ktmp1, ktmp1, ktmp2);
5094   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5095   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5096   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5097   bind(done);
5098 }
5099 
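     // Packs the doublewords selected by the vshufps immediate 'index' (e.g. 0x88 picks the
     // even doublewords) into the low 128 bits of dst; for a 256-bit source the high 128-bit
     // lane is extracted first so that both lanes contribute to the low-lane result.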
5100 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5101                                                              XMMRegister xtmp, int index, int vec_enc) {
5102   assert(vec_enc < Assembler::AVX_512bit, "");
5103   if (vec_enc == Assembler::AVX_256bit) {
5104     vextractf128_high(xtmp, src);
5105     vshufps(dst, src, xtmp, index, vec_enc);
5106   } else {
5107     vshufps(dst, src, zero, index, vec_enc);
5108   }
5109 }
5110 
5111 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5112                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5113                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5114   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5115 
5116   Label done;
5117   // Compare the destination lanes with float_sign_flip
5118   // value to get mask for all special values.
5119   movdqu(xtmp1, float_sign_flip, rscratch);
5120   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5121   ptest(xtmp2, xtmp2);
5122   jccb(Assembler::equal, done);
5123 
5124   // Flip float_sign_flip to get max integer value.
5125   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5126   pxor(xtmp1, xtmp4);
5127 
5128   // Set destination lanes corresponding to unordered source lanes to zero.
5129   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5130   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5131 
5132   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5133   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5134   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5135 
5136   // Recompute the mask for the remaining special values.
5137   pxor(xtmp2, xtmp3);
5138   // Extract mask corresponding to non-negative source lanes.
5139   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5140 
5141   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5142   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5143   pand(xtmp3, xtmp2);
5144 
5145   // Replace destination lanes holding the special value (0x80000000) with max int
5146   // if the corresponding source lane holds a +ve value.
5147   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5148   bind(done);
5149 }
5150 
5151 
5152 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5153                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5154   switch(to_elem_bt) {
5155     case T_SHORT:
5156       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5157       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5158       vpackusdw(dst, dst, zero, vec_enc);
5159       if (vec_enc == Assembler::AVX_256bit) {
5160         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5161       }
5162       break;
5163     case  T_BYTE:
5164       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5165       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5166       vpackusdw(dst, dst, zero, vec_enc);
5167       if (vec_enc == Assembler::AVX_256bit) {
5168         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5169       }
5170       vpackuswb(dst, dst, zero, vec_enc);
5171       break;
5172     default: assert(false, "%s", type2name(to_elem_bt));
5173   }
5174 }
5175 
5176 /*
5177  * Algorithm for vector D2L and F2I conversions:
5178  * a) Perform the vector D2L/F2I cast.
5179  * b) Take the fast path if none of the result vector lanes contains the value 0x80000000;
5180  *    that value signifies that the source value could be one of the special floating point
5181  *    values (NaN, -Inf, Inf, Max, -Min).
5182  * c) Set the destination lane to zero if the source lane is NaN.
5183  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5184  */
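     /*
      * Scalar sketch of the same fix-up (illustrative only, F2I case):
      *   int r = cvttss2si(f);                // 0x80000000 for NaN/Inf/out-of-range
      *   if (r == 0x80000000) {
      *     if (f != f)      r = 0;            // NaN -> 0
      *     else if (f > 0)  r = 0x7FFFFFFF;   // +Inf or too large -> Integer.MAX_VALUE
      *     // otherwise keep 0x80000000 (Integer.MIN_VALUE)
      *   }
      */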
5185 
5186 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5187                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5188                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5189   int to_elem_sz = type2aelembytes(to_elem_bt);
5190   assert(to_elem_sz <= 4, "");
5191   vcvttps2dq(dst, src, vec_enc);
5192   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5193   if (to_elem_sz < 4) {
5194     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5195     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5196   }
5197 }
5198 
5199 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5200                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5201                                             Register rscratch, int vec_enc) {
5202   int to_elem_sz = type2aelembytes(to_elem_bt);
5203   assert(to_elem_sz <= 4, "");
5204   vcvttps2dq(dst, src, vec_enc);
5205   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5206   switch(to_elem_bt) {
5207     case T_INT:
5208       break;
5209     case T_SHORT:
5210       evpmovdw(dst, dst, vec_enc);
5211       break;
5212     case T_BYTE:
5213       evpmovdb(dst, dst, vec_enc);
5214       break;
5215     default: assert(false, "%s", type2name(to_elem_bt));
5216   }
5217 }
5218 
5219 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5220                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5221                                             Register rscratch, int vec_enc) {
5222   evcvttps2qq(dst, src, vec_enc);
5223   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5224 }
5225 
5226 // Handling for downcasting from double to integer or sub-word types on AVX2.
5227 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5228                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5229                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5230   int to_elem_sz = type2aelembytes(to_elem_bt);
5231   assert(to_elem_sz < 8, "");
5232   vcvttpd2dq(dst, src, vec_enc);
5233   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5234                                               float_sign_flip, vec_enc);
5235   if (to_elem_sz < 4) {
5236     // xtmp4 holds all zero lanes.
5237     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5238   }
5239 }
5240 
5241 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5242                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5243                                             KRegister ktmp2, AddressLiteral sign_flip,
5244                                             Register rscratch, int vec_enc) {
5245   if (VM_Version::supports_avx512dq()) {
5246     evcvttpd2qq(dst, src, vec_enc);
5247     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5248     switch(to_elem_bt) {
5249       case T_LONG:
5250         break;
5251       case T_INT:
5252         evpmovsqd(dst, dst, vec_enc);
5253         break;
5254       case T_SHORT:
5255         evpmovsqd(dst, dst, vec_enc);
5256         evpmovdw(dst, dst, vec_enc);
5257         break;
5258       case T_BYTE:
5259         evpmovsqd(dst, dst, vec_enc);
5260         evpmovdb(dst, dst, vec_enc);
5261         break;
5262       default: assert(false, "%s", type2name(to_elem_bt));
5263     }
5264   } else {
5265     assert(type2aelembytes(to_elem_bt) <= 4, "");
5266     vcvttpd2dq(dst, src, vec_enc);
5267     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5268     switch(to_elem_bt) {
5269       case T_INT:
5270         break;
5271       case T_SHORT:
5272         evpmovdw(dst, dst, vec_enc);
5273         break;
5274       case T_BYTE:
5275         evpmovdb(dst, dst, vec_enc);
5276         break;
5277       default: assert(false, "%s", type2name(to_elem_bt));
5278     }
5279   }
5280 }
5281 
5282 #ifdef _LP64
5283 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5284                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5285                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5286   // Perform the floor(val+0.5) operation with MXCSR.RC set to round-towards -inf
5287   // and restore the original MXCSR.RC mode afterwards.
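       // For example, with rounding towards -inf: 2.5 + 0.5 = 3.0 -> 3 and -2.5 + 0.5 = -2.0 -> -2,
       // matching the floor(val + 0.5) definition above.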
5288   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5289 
5290   mov64(tmp, julong_cast(0.5L));
5291   evpbroadcastq(xtmp1, tmp, vec_enc);
5292   vaddpd(xtmp1, src , xtmp1, vec_enc);
5293   evcvtpd2qq(dst, xtmp1, vec_enc);
5294   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5295                                                 double_sign_flip, vec_enc);
5296 
5297   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5298 }
5299 
5300 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5301                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5302                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5303   // Perform the floor(val+0.5) operation with MXCSR.RC set to round-towards -inf
5304   // and restore the original MXCSR.RC mode afterwards.
5305   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5306 
5307   movl(tmp, jint_cast(0.5));
5308   movq(xtmp1, tmp);
5309   vbroadcastss(xtmp1, xtmp1, vec_enc);
5310   vaddps(xtmp1, src , xtmp1, vec_enc);
5311   vcvtps2dq(dst, xtmp1, vec_enc);
5312   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5313                                               float_sign_flip, vec_enc);
5314 
5315   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5316 }
5317 
5318 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5319                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5320                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5321   // Perform the floor(val+0.5) operation with MXCSR.RC set to round-towards -inf
5322   // and restore the original MXCSR.RC mode afterwards.
5323   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5324 
5325   movl(tmp, jint_cast(0.5));
5326   movq(xtmp1, tmp);
5327   vbroadcastss(xtmp1, xtmp1, vec_enc);
5328   vaddps(xtmp1, src , xtmp1, vec_enc);
5329   vcvtps2dq(dst, xtmp1, vec_enc);
5330   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5331 
5332   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5333 }
5334 #endif // _LP64
5335 
5336 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5337                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5338   switch (from_elem_bt) {
5339     case T_BYTE:
5340       switch (to_elem_bt) {
5341         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5342         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5343         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5344         default: ShouldNotReachHere();
5345       }
5346       break;
5347     case T_SHORT:
5348       switch (to_elem_bt) {
5349         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5350         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5351         default: ShouldNotReachHere();
5352       }
5353       break;
5354     case T_INT:
5355       assert(to_elem_bt == T_LONG, "");
5356       vpmovzxdq(dst, src, vlen_enc);
5357       break;
5358     default:
5359       ShouldNotReachHere();
5360   }
5361 }
5362 
5363 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5364                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5365   switch (from_elem_bt) {
5366     case T_BYTE:
5367       switch (to_elem_bt) {
5368         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5369         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5370         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5371         default: ShouldNotReachHere();
5372       }
5373       break;
5374     case T_SHORT:
5375       switch (to_elem_bt) {
5376         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5377         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5378         default: ShouldNotReachHere();
5379       }
5380       break;
5381     case T_INT:
5382       assert(to_elem_bt == T_LONG, "");
5383       vpmovsxdq(dst, src, vlen_enc);
5384       break;
5385     default:
5386       ShouldNotReachHere();
5387   }
5388 }
5389 
5390 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5391                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5392   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5393   assert(vlen_enc != AVX_512bit, "");
5394 
5395   int dst_bt_size = type2aelembytes(dst_bt);
5396   int src_bt_size = type2aelembytes(src_bt);
5397   if (dst_bt_size > src_bt_size) {
5398     switch (dst_bt_size / src_bt_size) {
5399       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5400       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5401       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5402       default: ShouldNotReachHere();
5403     }
5404   } else {
5405     assert(dst_bt_size < src_bt_size, "");
5406     switch (src_bt_size / dst_bt_size) {
5407       case 2: {
5408         if (vlen_enc == AVX_128bit) {
5409           vpacksswb(dst, src, src, vlen_enc);
5410         } else {
5411           vpacksswb(dst, src, src, vlen_enc);
5412           vpermq(dst, dst, 0x08, vlen_enc);
5413         }
5414         break;
5415       }
5416       case 4: {
5417         if (vlen_enc == AVX_128bit) {
5418           vpackssdw(dst, src, src, vlen_enc);
5419           vpacksswb(dst, dst, dst, vlen_enc);
5420         } else {
5421           vpackssdw(dst, src, src, vlen_enc);
5422           vpermq(dst, dst, 0x08, vlen_enc);
5423           vpacksswb(dst, dst, dst, AVX_128bit);
5424         }
5425         break;
5426       }
5427       case 8: {
5428         if (vlen_enc == AVX_128bit) {
5429           vpshufd(dst, src, 0x08, vlen_enc);
5430           vpackssdw(dst, dst, dst, vlen_enc);
5431           vpacksswb(dst, dst, dst, vlen_enc);
5432         } else {
5433           vpshufd(dst, src, 0x08, vlen_enc);
5434           vpermq(dst, dst, 0x08, vlen_enc);
5435           vpackssdw(dst, dst, dst, AVX_128bit);
5436           vpacksswb(dst, dst, dst, AVX_128bit);
5437         }
5438         break;
5439       }
5440       default: ShouldNotReachHere();
5441     }
5442   }
5443 }
5444 
5445 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5446                                    bool merge, BasicType bt, int vlen_enc) {
5447   if (bt == T_INT) {
5448     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5449   } else {
5450     assert(bt == T_LONG, "");
5451     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5452   }
5453 }
5454 
5455 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5456                                    bool merge, BasicType bt, int vlen_enc) {
5457   if (bt == T_INT) {
5458     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5459   } else {
5460     assert(bt == T_LONG, "");
5461     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5462   }
5463 }
5464 
5465 #ifdef _LP64
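// Expands a bit-per-lane mask held in a GPR into a byte-per-lane vector: pdepq deposits
// 8 mask bits at a time into the least significant bit of 8 consecutive bytes (byte lanes
// become 0 or 1), which are then inserted into the destination vector 64 bits at a time.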
5466 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5467                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5468                                                int vec_enc) {
5469   int index = 0;
5470   int vindex = 0;
5471   mov64(rtmp1, 0x0101010101010101L);
5472   pdepq(rtmp1, src, rtmp1);
5473   if (mask_len > 8) {
5474     movq(rtmp2, src);
5475     vpxor(xtmp, xtmp, xtmp, vec_enc);
5476     movq(xtmp, rtmp1);
5477   }
5478   movq(dst, rtmp1);
5479 
5480   mask_len -= 8;
5481   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
5483     index++;
5484     if ((index % 2) == 0) {
5485       pxor(xtmp, xtmp);
5486     }
5487     mov64(rtmp1, 0x0101010101010101L);
5488     shrq(rtmp2, 8);
5489     pdepq(rtmp1, rtmp2, rtmp1);
5490     pinsrq(xtmp, rtmp1, index % 2);
5491     vindex = index / 2;
5492     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes are updated, to save redundant instructions.
5495       if (index % 2) {
5496         vinsertf128(dst, dst, xtmp, vindex);
5497       }
5498     } else {
5499       vmovdqu(dst, xtmp);
5500     }
5501     mask_len -= 8;
5502   }
5503 }
5504 
5505 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5506   switch(opc) {
5507     case Op_VectorMaskTrueCount:
5508       popcntq(dst, tmp);
5509       break;
5510     case Op_VectorMaskLastTrue:
5511       if (VM_Version::supports_lzcnt()) {
5512         lzcntq(tmp, tmp);
5513         movl(dst, 63);
5514         subl(dst, tmp);
5515       } else {
5516         movl(dst, -1);
5517         bsrq(tmp, tmp);
5518         cmov32(Assembler::notZero, dst, tmp);
5519       }
5520       break;
5521     case Op_VectorMaskFirstTrue:
5522       if (VM_Version::supports_bmi1()) {
5523         if (masklen < 32) {
5524           orl(tmp, 1 << masklen);
5525           tzcntl(dst, tmp);
5526         } else if (masklen == 32) {
5527           tzcntl(dst, tmp);
5528         } else {
5529           assert(masklen == 64, "");
5530           tzcntq(dst, tmp);
5531         }
5532       } else {
5533         if (masklen < 32) {
5534           orl(tmp, 1 << masklen);
5535           bsfl(dst, tmp);
5536         } else {
5537           assert(masklen == 32 || masklen == 64, "");
5538           movl(dst, masklen);
5539           if (masklen == 32)  {
5540             bsfl(tmp, tmp);
5541           } else {
5542             bsfq(tmp, tmp);
5543           }
5544           cmov32(Assembler::notZero, dst, tmp);
5545         }
5546       }
5547       break;
5548     case Op_VectorMaskToLong:
5549       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5550       break;
5551     default: assert(false, "Unhandled mask operation");
5552   }
5553 }
5554 
5555 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5556                                               int masklen, int masksize, int vec_enc) {
5557   assert(VM_Version::supports_popcnt(), "");
5558 
  if (VM_Version::supports_avx512bw()) {
5560     kmovql(tmp, mask);
5561   } else {
5562     assert(masklen <= 16, "");
5563     kmovwl(tmp, mask);
5564   }
5565 
  // A mask generated out of partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5568   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5569     andq(tmp, (1 << masklen) - 1);
5570   }
5571 
5572   vector_mask_operation_helper(opc, dst, tmp, masklen);
5573 }
5574 
5575 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5576                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5577   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5578          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5579   assert(VM_Version::supports_popcnt(), "");
5580 
5581   bool need_clip = false;
5582   switch(bt) {
5583     case T_BOOLEAN:
      // While masks of other types contain 0 / -1 lane values, boolean masks contain lane values of 0 / 1
5585       vpxor(xtmp, xtmp, xtmp, vec_enc);
5586       vpsubb(xtmp, xtmp, mask, vec_enc);
5587       vpmovmskb(tmp, xtmp, vec_enc);
5588       need_clip = masklen < 16;
5589       break;
5590     case T_BYTE:
5591       vpmovmskb(tmp, mask, vec_enc);
5592       need_clip = masklen < 16;
5593       break;
5594     case T_SHORT:
5595       vpacksswb(xtmp, mask, mask, vec_enc);
5596       if (masklen >= 16) {
5597         vpermpd(xtmp, xtmp, 8, vec_enc);
5598       }
5599       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5600       need_clip = masklen < 16;
5601       break;
5602     case T_INT:
5603     case T_FLOAT:
5604       vmovmskps(tmp, mask, vec_enc);
5605       need_clip = masklen < 4;
5606       break;
5607     case T_LONG:
5608     case T_DOUBLE:
5609       vmovmskpd(tmp, mask, vec_enc);
5610       need_clip = masklen < 2;
5611       break;
5612     default: assert(false, "Unhandled type, %s", type2name(bt));
5613   }
5614 
  // A mask generated out of partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5617   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5618     // need_clip implies masklen < 32
5619     andq(tmp, (1 << masklen) - 1);
5620   }
5621 
5622   vector_mask_operation_helper(opc, dst, tmp, masklen);
5623 }
5624 
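// Compresses a mask of mask_len lanes into a prefix mask: pextq gathers one bit from an
// all-ones source for every set bit of the (clipped) input mask, leaving popcount(src)
// contiguous set bits in the low positions of dst.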
5625 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5626                                              Register rtmp2, int mask_len) {
5627   kmov(rtmp1, src);
5628   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5629   mov64(rtmp2, -1L);
5630   pextq(rtmp2, rtmp2, rtmp1);
5631   kmov(dst, rtmp2);
5632 }
5633 
5634 #ifdef _LP64
5635 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5636                                                     XMMRegister mask, Register rtmp, Register rscratch,
5637                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5638                                                     int vec_enc) {
5639   assert(type2aelembytes(bt) >= 4, "");
5640   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5641   address compress_perm_table = nullptr;
5642   address expand_perm_table = nullptr;
5643   if (type2aelembytes(bt) == 8) {
5644     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5645     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5646     vmovmskpd(rtmp, mask, vec_enc);
5647   } else {
5648     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5649     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5650     vmovmskps(rtmp, mask, vec_enc);
5651   }
5652   shlq(rtmp, 5); // for 32 byte permute row.
5653   if (opcode == Op_CompressV) {
5654     lea(rscratch, ExternalAddress(compress_perm_table));
5655   } else {
5656     lea(rscratch, ExternalAddress(expand_perm_table));
5657   }
5658   addptr(rtmp, rscratch);
5659   vmovdqu(permv, Address(rtmp));
5660   vpermps(dst, permv, src, Assembler::AVX_256bit);
5661   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or -1 (the default)
  // value, so the row can also be used as a blending mask after
  // compressing/expanding the source vector lanes.
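  // Illustrative example (assumed row layout): for an 8-lane compress with mask
  // 0b00100110 the permute row would be [1, 2, 5, -1, -1, -1, -1, -1]; vpermps
  // gathers lanes 1, 2 and 5 into the low positions, and the -1 entries (sign bit
  // set) select the zero vector in the blend below.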
5666   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5667 }
5668 #endif
5669 
5670 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5671                                                bool merge, BasicType bt, int vec_enc) {
5672   if (opcode == Op_CompressV) {
5673     switch(bt) {
5674     case T_BYTE:
5675       evpcompressb(dst, mask, src, merge, vec_enc);
5676       break;
5677     case T_CHAR:
5678     case T_SHORT:
5679       evpcompressw(dst, mask, src, merge, vec_enc);
5680       break;
5681     case T_INT:
5682       evpcompressd(dst, mask, src, merge, vec_enc);
5683       break;
5684     case T_FLOAT:
5685       evcompressps(dst, mask, src, merge, vec_enc);
5686       break;
5687     case T_LONG:
5688       evpcompressq(dst, mask, src, merge, vec_enc);
5689       break;
5690     case T_DOUBLE:
5691       evcompresspd(dst, mask, src, merge, vec_enc);
5692       break;
5693     default:
5694       fatal("Unsupported type %s", type2name(bt));
5695       break;
5696     }
5697   } else {
5698     assert(opcode == Op_ExpandV, "");
5699     switch(bt) {
5700     case T_BYTE:
5701       evpexpandb(dst, mask, src, merge, vec_enc);
5702       break;
5703     case T_CHAR:
5704     case T_SHORT:
5705       evpexpandw(dst, mask, src, merge, vec_enc);
5706       break;
5707     case T_INT:
5708       evpexpandd(dst, mask, src, merge, vec_enc);
5709       break;
5710     case T_FLOAT:
5711       evexpandps(dst, mask, src, merge, vec_enc);
5712       break;
5713     case T_LONG:
5714       evpexpandq(dst, mask, src, merge, vec_enc);
5715       break;
5716     case T_DOUBLE:
5717       evexpandpd(dst, mask, src, merge, vec_enc);
5718       break;
5719     default:
5720       fatal("Unsupported type %s", type2name(bt));
5721       break;
5722     }
5723   }
5724 }
5725 #endif
5726 
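// Vector Math.signum: for each lane, return the source value if it is NaN, -0.0 or 0.0,
// otherwise -1.0 for negative values and 1.0 for positive values.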
5727 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5728                                            KRegister ktmp1, int vec_enc) {
5729   if (opcode == Op_SignumVD) {
5730     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5732     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5733     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5735     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5736     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5737   } else {
5738     assert(opcode == Op_SignumVF, "");
5739     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5741     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5742     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5744     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5745     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5746   }
5747 }
5748 
5749 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5750                                           XMMRegister xtmp1, int vec_enc) {
5751   if (opcode == Op_SignumVD) {
5752     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5754     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5756     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5757     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5758   } else {
5759     assert(opcode == Op_SignumVF, "");
5760     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5762     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5764     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5765     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5766   }
5767 }
5768 
5769 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5770   if (VM_Version::supports_avx512bw()) {
5771     if (mask_len > 32) {
5772       kmovql(dst, src);
5773     } else {
5774       kmovdl(dst, src);
5775       if (mask_len != 32) {
5776         kshiftrdl(dst, dst, 32 - mask_len);
5777       }
5778     }
5779   } else {
5780     assert(mask_len <= 16, "");
5781     kmovwl(dst, src);
5782     if (mask_len != 16) {
5783       kshiftrwl(dst, dst, 16 - mask_len);
5784     }
5785   }
5786 }
5787 
5788 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5789   int lane_size = type2aelembytes(bt);
5790   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5791   if ((is_LP64 || lane_size < 8) &&
5792       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5793        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5794     movptr(rtmp, imm32);
5795     switch(lane_size) {
5796       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5797       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5798       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5799       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5802     }
5803   } else {
5804     movptr(rtmp, imm32);
5805     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5806     switch(lane_size) {
5807       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5808       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5809       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5810       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5813     }
5814   }
5815 }
5816 
5817 //
5818 // Following is lookup table based popcount computation algorithm:-
5819 //       Index   Bit set count
5820 //     [ 0000 ->   0,
5821 //       0001 ->   1,
5822 //       0010 ->   1,
5823 //       0011 ->   2,
5824 //       0100 ->   1,
5825 //       0101 ->   2,
5826 //       0110 ->   2,
5827 //       0111 ->   3,
5828 //       1000 ->   1,
5829 //       1001 ->   2,
5830 //       1010 ->   3,
5831 //       1011 ->   3,
5832 //       1100 ->   2,
5833 //       1101 ->   3,
5834 //       1111 ->   4 ]
5835 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5836 //     shuffle indices for lookup table access.
5837 //  b. Right shift each byte of vector lane by 4 positions.
5838 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5839 //     shuffle indices for lookup table access.
5840 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5841 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5842 //     count of all the bytes of a quadword.
5843 //  f. Perform step e. for upper 128bit vector lane.
5844 //  g. Pack the bitset count of quadwords back to double word.
5845 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
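//
// Illustrative example: for the byte 0xB5 (1011 0101) the lower nibble 0101 indexes the
// table to 2 and the upper nibble 1011 indexes the table to 3, giving a per-byte popcount
// of 2 + 3 = 5.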
5846 
5847 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5848                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5849   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5850   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5851   vpsrlw(dst, src, 4, vec_enc);
5852   vpand(dst, dst, xtmp1, vec_enc);
5853   vpand(xtmp1, src, xtmp1, vec_enc);
5854   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5855   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5856   vpshufb(dst, xtmp2, dst, vec_enc);
5857   vpaddb(dst, dst, xtmp1, vec_enc);
5858 }
5859 
5860 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5861                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5862   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code implements steps e, f, g and h of the above algorithm.
5864   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5865   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5866   vpsadbw(dst, dst, xtmp2, vec_enc);
5867   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5868   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5869   vpackuswb(dst, xtmp1, dst, vec_enc);
5870 }
5871 
5872 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5873                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5874   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcounts of the upper and lower bytes of each word.
5876   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5877   vpsrlw(dst, xtmp1, 8, vec_enc);
5878   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5879   vpaddw(dst, dst, xtmp1, vec_enc);
5880 }
5881 
5882 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5883                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5884   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5885   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5886   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5887 }
5888 
5889 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5890                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5891   switch(bt) {
5892     case T_LONG:
5893       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5894       break;
5895     case T_INT:
5896       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5897       break;
5898     case T_CHAR:
5899     case T_SHORT:
5900       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5901       break;
5902     case T_BYTE:
5903     case T_BOOLEAN:
5904       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5905       break;
5906     default:
5907       fatal("Unsupported type %s", type2name(bt));
5908       break;
5909   }
5910 }
5911 
5912 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5913                                                       KRegister mask, bool merge, int vec_enc) {
5914   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5915   switch(bt) {
5916     case T_LONG:
5917       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5918       evpopcntq(dst, mask, src, merge, vec_enc);
5919       break;
5920     case T_INT:
5921       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5922       evpopcntd(dst, mask, src, merge, vec_enc);
5923       break;
5924     case T_CHAR:
5925     case T_SHORT:
5926       assert(VM_Version::supports_avx512_bitalg(), "");
5927       evpopcntw(dst, mask, src, merge, vec_enc);
5928       break;
5929     case T_BYTE:
5930     case T_BOOLEAN:
5931       assert(VM_Version::supports_avx512_bitalg(), "");
5932       evpopcntb(dst, mask, src, merge, vec_enc);
5933       break;
5934     default:
5935       fatal("Unsupported type %s", type2name(bt));
5936       break;
5937   }
5938 }
5939 
5940 #ifndef _LP64
5941 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5942   assert(VM_Version::supports_avx512bw(), "");
5943   kmovdl(tmp, src);
5944   kunpckdql(dst, tmp, tmp);
5945 }
5946 #endif
5947 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reversed bit sequences of the upper and lower
// nibbles of the byte.
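// Illustrative example: for the byte 0b10110001 the lookup table maps the lower nibble
// 0001 to 1000 and the upper nibble 1011 to 1101; placing the reversed lower nibble in
// the upper half and the reversed upper nibble in the lower half yields 0b10001101, the
// bit-reversed byte.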
5954 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5955                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5956   if (VM_Version::supports_avx512vlbw()) {
5957 
5958     // Get the reverse bit sequence of lower nibble of each byte.
5959     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5960     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5961     evpandq(dst, xtmp2, src, vec_enc);
5962     vpshufb(dst, xtmp1, dst, vec_enc);
5963     vpsllq(dst, dst, 4, vec_enc);
5964 
5965     // Get the reverse bit sequence of upper nibble of each byte.
5966     vpandn(xtmp2, xtmp2, src, vec_enc);
5967     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5968     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5969 
    // OR the left shifted reverse bit sequence of the lower nibble with the reverse bit
    // sequence of the (right shifted) upper nibble to obtain the reverse bit sequence of each byte.
5972     evporq(xtmp2, dst, xtmp2, vec_enc);
5973     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5974 
  } else if (vec_enc == Assembler::AVX_512bit) {
5976     // Shift based bit reversal.
5977     assert(bt == T_LONG || bt == T_INT, "");
5978 
5979     // Swap lower and upper nibble of each byte.
5980     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5981 
5982     // Swap two least and most significant bits of each nibble.
5983     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5984 
5985     // Swap adjacent pair of bits.
5986     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5987     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5988 
5989     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5990     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5991   } else {
5992     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5993     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5994 
5995     // Get the reverse bit sequence of lower nibble of each byte.
5996     vpand(dst, xtmp2, src, vec_enc);
5997     vpshufb(dst, xtmp1, dst, vec_enc);
5998     vpsllq(dst, dst, 4, vec_enc);
5999 
6000     // Get the reverse bit sequence of upper nibble of each byte.
6001     vpandn(xtmp2, xtmp2, src, vec_enc);
6002     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6003     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6004 
    // OR the left shifted reverse bit sequence of the lower nibble with the reverse bit
    // sequence of the (right shifted) upper nibble to obtain the reverse bit sequence of each byte.
6007     vpor(xtmp2, dst, xtmp2, vec_enc);
6008     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6009   }
6010 }
6011 
6012 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6013                                                 XMMRegister xtmp, Register rscratch) {
6014   assert(VM_Version::supports_gfni(), "");
6015   assert(rscratch != noreg || always_reachable(mask), "missing");
6016 
  // Galois field instruction based bit reversal as per the following algorithm.
6018   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6019   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6020   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6021   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6022 }
6023 
6024 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6025                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6026   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6027   evpandq(dst, xtmp1, src, vec_enc);
6028   vpsllq(dst, dst, nbits, vec_enc);
6029   vpandn(xtmp1, xtmp1, src, vec_enc);
6030   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6031   evporq(dst, dst, xtmp1, vec_enc);
6032 }
6033 
6034 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6035                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6036   // Shift based bit reversal.
6037   assert(VM_Version::supports_evex(), "");
6038   switch(bt) {
6039     case T_LONG:
6040       // Swap upper and lower double word of each quad word.
6041       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6042       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6043       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6044       break;
6045     case T_INT:
6046       // Swap upper and lower word of each double word.
6047       evprord(xtmp1, k0, src, 16, true, vec_enc);
6048       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6049       break;
6050     case T_CHAR:
6051     case T_SHORT:
6052       // Swap upper and lower byte of each word.
6053       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6054       break;
6055     case T_BYTE:
6056       evmovdquq(dst, k0, src, true, vec_enc);
6057       break;
6058     default:
6059       fatal("Unsupported type %s", type2name(bt));
6060       break;
6061   }
6062 }
6063 
6064 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6065   if (bt == T_BYTE) {
6066     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6067       evmovdquq(dst, k0, src, true, vec_enc);
6068     } else {
6069       vmovdqu(dst, src);
6070     }
6071     return;
6072   }
6073   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6074   // pre-computed shuffle indices.
6075   switch(bt) {
6076     case T_LONG:
6077       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6078       break;
6079     case T_INT:
6080       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6081       break;
6082     case T_CHAR:
6083     case T_SHORT:
6084       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6085       break;
6086     default:
6087       fatal("Unsupported type %s", type2name(bt));
6088       break;
6089   }
6090   vpshufb(dst, src, dst, vec_enc);
6091 }
6092 
6093 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6094                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6095                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6096   assert(is_integral_type(bt), "");
6097   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6098   assert(VM_Version::supports_avx512cd(), "");
6099   switch(bt) {
6100     case T_LONG:
6101       evplzcntq(dst, ktmp, src, merge, vec_enc);
6102       break;
6103     case T_INT:
6104       evplzcntd(dst, ktmp, src, merge, vec_enc);
6105       break;
6106     case T_SHORT:
6107       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6108       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6109       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6110       vpunpckhwd(dst, xtmp1, src, vec_enc);
6111       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6112       vpackusdw(dst, xtmp2, dst, vec_enc);
6113       break;
6114     case T_BYTE:
6115       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6116       // accessing the lookup table.
6117       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6118       // accessing the lookup table.
6119       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6120       assert(VM_Version::supports_avx512bw(), "");
6121       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6122       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6123       vpand(xtmp2, dst, src, vec_enc);
6124       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6125       vpsrlw(xtmp3, src, 4, vec_enc);
6126       vpand(xtmp3, dst, xtmp3, vec_enc);
6127       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6128       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6129       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6130       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6131       break;
6132     default:
6133       fatal("Unsupported type %s", type2name(bt));
6134       break;
6135   }
6136 }
6137 
6138 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6139                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6140   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6141   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6142   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6143   // accessing the lookup table.
6144   vpand(dst, xtmp2, src, vec_enc);
6145   vpshufb(dst, xtmp1, dst, vec_enc);
6146   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6147   // accessing the lookup table.
6148   vpsrlw(xtmp3, src, 4, vec_enc);
6149   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6150   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6151   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6152   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6153   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6154   vpaddb(dst, dst, xtmp2, vec_enc);
6155   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6156 }
6157 
6158 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6159                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6160   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6161   // Add zero counts of lower byte and upper byte of a word if
6162   // upper byte holds a zero value.
6163   vpsrlw(xtmp3, src, 8, vec_enc);
6164   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6165   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6166   vpsllw(xtmp2, dst, 8, vec_enc);
6167   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6168   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6169   vpsrlw(dst, dst, 8, vec_enc);
6170 }
6171 
6172 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6173                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 32 - (biased_exp - 127 + 1)
  // Special handling is needed for zero, MAX_INT and negative source values.
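  // Illustrative example: src = 8 converts to 8.0f with biased exponent 130, so
  // LZCNT = 32 - (130 - 127 + 1) = 28, which matches clz(8) for a 32 bit lane.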
6179 
6180   // Broadcast 0xFF
6181   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6182   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6183 
6184   // Extract biased exponent.
6185   vcvtdq2ps(dst, src, vec_enc);
6186   vpsrld(dst, dst, 23, vec_enc);
6187   vpand(dst, dst, xtmp1, vec_enc);
6188 
6189   // Broadcast 127.
6190   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6191   // Exponent = biased_exp - 127
6192   vpsubd(dst, dst, xtmp1, vec_enc);
6193 
6194   // Exponent = Exponent  + 1
6195   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6196   vpaddd(dst, dst, xtmp3, vec_enc);
6197 
6198   // Replace -ve exponent with zero, exponent is -ve when src
6199   // lane contains a zero value.
6200   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6201   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6202 
6203   // Rematerialize broadcast 32.
6204   vpslld(xtmp1, xtmp3, 5, vec_enc);
6205   // Exponent is 32 if corresponding source lane contains max_int value.
6206   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6207   // LZCNT = 32 - exponent
6208   vpsubd(dst, xtmp1, dst, vec_enc);
6209 
6210   // Replace LZCNT with a value 1 if corresponding source lane
6211   // contains max_int value.
6212   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6213 
6214   // Replace biased_exp with 0 if source lane value is less than zero.
6215   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6216   vblendvps(dst, dst, xtmp2, src, vec_enc);
6217 }
6218 
6219 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6220                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6221   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6222   // Add zero counts of lower word and upper word of a double word if
6223   // upper word holds a zero value.
6224   vpsrld(xtmp3, src, 16, vec_enc);
6225   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6226   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6227   vpslld(xtmp2, dst, 16, vec_enc);
6228   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6229   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6230   vpsrld(dst, dst, 16, vec_enc);
6231   // Add zero counts of lower doubleword and upper doubleword of a
6232   // quadword if upper doubleword holds a zero value.
6233   vpsrlq(xtmp3, src, 32, vec_enc);
6234   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6235   vpsllq(xtmp2, dst, 32, vec_enc);
6236   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6237   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6238   vpsrlq(dst, dst, 32, vec_enc);
6239 }
6240 
6241 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6242                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6243                                                        Register rtmp, int vec_enc) {
6244   assert(is_integral_type(bt), "unexpected type");
6245   assert(vec_enc < Assembler::AVX_512bit, "");
6246   switch(bt) {
6247     case T_LONG:
6248       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6249       break;
6250     case T_INT:
6251       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6252       break;
6253     case T_SHORT:
6254       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6255       break;
6256     case T_BYTE:
6257       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6258       break;
6259     default:
6260       fatal("Unsupported type %s", type2name(bt));
6261       break;
6262   }
6263 }
6264 
6265 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6266   switch(bt) {
6267     case T_BYTE:
6268       vpsubb(dst, src1, src2, vec_enc);
6269       break;
6270     case T_SHORT:
6271       vpsubw(dst, src1, src2, vec_enc);
6272       break;
6273     case T_INT:
6274       vpsubd(dst, src1, src2, vec_enc);
6275       break;
6276     case T_LONG:
6277       vpsubq(dst, src1, src2, vec_enc);
6278       break;
6279     default:
6280       fatal("Unsupported type %s", type2name(bt));
6281       break;
6282   }
6283 }
6284 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6289 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6290                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6291                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6292   assert(is_integral_type(bt), "");
6293   // xtmp = -1
6294   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6295   // xtmp = xtmp + src
6296   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6297   // xtmp = xtmp & ~src
6298   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6299   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6300   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6301   vpsub(bt, dst, xtmp4, dst, vec_enc);
6302 }
6303 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6306 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6307                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6308   assert(is_integral_type(bt), "");
6309   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6311   // xtmp = 0 - src
6312   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6313   // xtmp = xtmp | src
6314   vpor(xtmp3, xtmp3, src, vec_enc);
6315   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6316   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6317   vpsub(bt, dst, xtmp1, dst, vec_enc);
6318 }
6319 
6320 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6321   Label done;
6322   Label neg_divisor_fastpath;
6323   cmpl(divisor, 0);
6324   jccb(Assembler::less, neg_divisor_fastpath);
6325   xorl(rdx, rdx);
6326   divl(divisor);
6327   jmpb(done);
6328   bind(neg_divisor_fastpath);
6329   // Fastpath for divisor < 0:
6330   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6331   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
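  // Rationale (illustrative): when the divisor has its sign bit set it is >= 2^31 as an
  // unsigned value, so the unsigned quotient can only be 0 or 1; it is 1 exactly when
  // dividend >= divisor (unsigned), which is what the sign bit of
  // dividend & ~(dividend - divisor) encodes.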
6332   movl(rdx, rax);
6333   subl(rdx, divisor);
6334   if (VM_Version::supports_bmi1()) {
6335     andnl(rax, rdx, rax);
6336   } else {
6337     notl(rdx);
6338     andl(rax, rdx);
6339   }
6340   shrl(rax, 31);
6341   bind(done);
6342 }
6343 
6344 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6345   Label done;
6346   Label neg_divisor_fastpath;
6347   cmpl(divisor, 0);
6348   jccb(Assembler::less, neg_divisor_fastpath);
6349   xorl(rdx, rdx);
6350   divl(divisor);
6351   jmpb(done);
6352   bind(neg_divisor_fastpath);
6353   // Fastpath when divisor < 0:
6354   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6355   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
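  // Rationale (illustrative): with the quotient known to be 0 or 1 (see udivI above),
  // the remainder is either the dividend itself or dividend - divisor.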
6356   movl(rdx, rax);
6357   subl(rax, divisor);
6358   if (VM_Version::supports_bmi1()) {
6359     andnl(rax, rax, rdx);
6360   } else {
6361     notl(rax);
6362     andl(rax, rdx);
6363   }
6364   sarl(rax, 31);
6365   andl(rax, divisor);
6366   subl(rdx, rax);
6367   bind(done);
6368 }
6369 
6370 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6371   Label done;
6372   Label neg_divisor_fastpath;
6373 
6374   cmpl(divisor, 0);
6375   jccb(Assembler::less, neg_divisor_fastpath);
6376   xorl(rdx, rdx);
6377   divl(divisor);
6378   jmpb(done);
6379   bind(neg_divisor_fastpath);
6380   // Fastpath for divisor < 0:
6381   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6382   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6383   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6384   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6385   movl(rdx, rax);
6386   subl(rax, divisor);
6387   if (VM_Version::supports_bmi1()) {
6388     andnl(rax, rax, rdx);
6389   } else {
6390     notl(rax);
6391     andl(rax, rdx);
6392   }
6393   movl(tmp, rax);
6394   shrl(rax, 31); // quotient
6395   sarl(tmp, 31);
6396   andl(tmp, divisor);
6397   subl(rdx, tmp); // remainder
6398   bind(done);
6399 }
6400 
6401 #ifdef _LP64
6402 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6403                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal as per the following algorithm.
6406     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6407     mov64(rtmp, 0x8040201008040201L);
6408     movq(xtmp1, src);
6409     movq(xtmp2, rtmp);
6410     gf2p8affineqb(xtmp1, xtmp2, 0);
6411     movq(dst, xtmp1);
6412   } else {
6413     // Swap even and odd numbered bits.
6414     movl(rtmp, src);
6415     andl(rtmp, 0x55555555);
6416     shll(rtmp, 1);
6417     movl(dst, src);
6418     andl(dst, 0xAAAAAAAA);
6419     shrl(dst, 1);
6420     orl(dst, rtmp);
6421 
6422     // Swap LSB and MSB 2 bits of each nibble.
6423     movl(rtmp, dst);
6424     andl(rtmp, 0x33333333);
6425     shll(rtmp, 2);
6426     andl(dst, 0xCCCCCCCC);
6427     shrl(dst, 2);
6428     orl(dst, rtmp);
6429 
6430     // Swap LSB and MSB 4 bits of each byte.
6431     movl(rtmp, dst);
6432     andl(rtmp, 0x0F0F0F0F);
6433     shll(rtmp, 4);
6434     andl(dst, 0xF0F0F0F0);
6435     shrl(dst, 4);
6436     orl(dst, rtmp);
6437   }
6438   bswapl(dst);
6439 }
6440 
6441 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6442                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal as per the following algorithm.
6445     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6446     mov64(rtmp1, 0x8040201008040201L);
6447     movq(xtmp1, src);
6448     movq(xtmp2, rtmp1);
6449     gf2p8affineqb(xtmp1, xtmp2, 0);
6450     movq(dst, xtmp1);
6451   } else {
6452     // Swap even and odd numbered bits.
6453     movq(rtmp1, src);
6454     mov64(rtmp2, 0x5555555555555555L);
6455     andq(rtmp1, rtmp2);
6456     shlq(rtmp1, 1);
6457     movq(dst, src);
6458     notq(rtmp2);
6459     andq(dst, rtmp2);
6460     shrq(dst, 1);
6461     orq(dst, rtmp1);
6462 
6463     // Swap LSB and MSB 2 bits of each nibble.
6464     movq(rtmp1, dst);
6465     mov64(rtmp2, 0x3333333333333333L);
6466     andq(rtmp1, rtmp2);
6467     shlq(rtmp1, 2);
6468     notq(rtmp2);
6469     andq(dst, rtmp2);
6470     shrq(dst, 2);
6471     orq(dst, rtmp1);
6472 
6473     // Swap LSB and MSB 4 bits of each byte.
6474     movq(rtmp1, dst);
6475     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6476     andq(rtmp1, rtmp2);
6477     shlq(rtmp1, 4);
6478     notq(rtmp2);
6479     andq(dst, rtmp2);
6480     shrq(dst, 4);
6481     orq(dst, rtmp1);
6482   }
6483   bswapq(dst);
6484 }
6485 
6486 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6487   Label done;
6488   Label neg_divisor_fastpath;
6489   cmpq(divisor, 0);
6490   jccb(Assembler::less, neg_divisor_fastpath);
6491   xorl(rdx, rdx);
6492   divq(divisor);
6493   jmpb(done);
6494   bind(neg_divisor_fastpath);
6495   // Fastpath for divisor < 0:
6496   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6497   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6498   movq(rdx, rax);
6499   subq(rdx, divisor);
6500   if (VM_Version::supports_bmi1()) {
6501     andnq(rax, rdx, rax);
6502   } else {
6503     notq(rdx);
6504     andq(rax, rdx);
6505   }
6506   shrq(rax, 63);
6507   bind(done);
6508 }
6509 
6510 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6511   Label done;
6512   Label neg_divisor_fastpath;
6513   cmpq(divisor, 0);
6514   jccb(Assembler::less, neg_divisor_fastpath);
6515   xorq(rdx, rdx);
6516   divq(divisor);
6517   jmp(done);
6518   bind(neg_divisor_fastpath);
6519   // Fastpath when divisor < 0:
6520   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6521   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6522   movq(rdx, rax);
6523   subq(rax, divisor);
6524   if (VM_Version::supports_bmi1()) {
6525     andnq(rax, rax, rdx);
6526   } else {
6527     notq(rax);
6528     andq(rax, rdx);
6529   }
6530   sarq(rax, 63);
6531   andq(rax, divisor);
6532   subq(rdx, rax);
6533   bind(done);
6534 }
6535 
6536 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6537   Label done;
6538   Label neg_divisor_fastpath;
6539   cmpq(divisor, 0);
6540   jccb(Assembler::less, neg_divisor_fastpath);
6541   xorq(rdx, rdx);
6542   divq(divisor);
6543   jmp(done);
6544   bind(neg_divisor_fastpath);
6545   // Fastpath for divisor < 0:
6546   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6547   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6548   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6549   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6550   movq(rdx, rax);
6551   subq(rax, divisor);
6552   if (VM_Version::supports_bmi1()) {
6553     andnq(rax, rax, rdx);
6554   } else {
6555     notq(rax);
6556     andq(rax, rdx);
6557   }
6558   movq(tmp, rax);
6559   shrq(rax, 63); // quotient
6560   sarq(tmp, 63);
6561   andq(tmp, divisor);
6562   subq(rdx, tmp); // remainder
6563   bind(done);
6564 }
6565 #endif
6566 
6567 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6568                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6569                                         int vlen_enc) {
6570   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are normalized
  // to the index range 0-15. This makes sure that shuffle indices which differ
  // by a multiple of 16 select the same relative position within a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48 all
  // select the first element of their respective 128 bit lanes.
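  // Illustrative example: shuffle index 37 has lower 4 bits 5 and satisfies
  // 32 <= 37 < 48, so the third pass below broadcasts the third 128 bit lane and
  // selects its byte 5.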
6577   movl(rtmp, 16);
6578   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6579 
6580   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6581   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6582   // original shuffle indices and move the shuffled lanes corresponding to true
6583   // mask to destination vector.
6584   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6585   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6586   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6587 
6588   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6589   // and broadcasting second 128 bit lane.
6590   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6591   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6592   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6593   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6594   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6595 
6596   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6597   // and broadcasting third 128 bit lane.
6598   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6599   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6600   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6601   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6602   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6603 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6606   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6607   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6608   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6609   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6610   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6611 }
6612 
6613 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6614                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6615   if (vlen_enc == AVX_128bit) {
6616     vpermilps(dst, src, shuffle, vlen_enc);
6617   } else if (bt == T_INT) {
6618     vpermd(dst, shuffle, src, vlen_enc);
6619   } else {
6620     assert(bt == T_FLOAT, "");
6621     vpermps(dst, shuffle, src, vlen_enc);
6622   }
6623 }