1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  69   // We require that their callers must bang for them.  But be careful, because
  70   // some VM calls (such as call site linkage) can use several kilobytes of
  71   // stack.  But the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp, so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
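// Maps the vector length in bytes of an operation to the AVX/EVEX length encoding passed as
// the vlen_enc argument of vector instructions; e.g. a 32-byte (256-bit) operation uses
// Assembler::AVX_256bit, while 4- and 8-byte lengths fall back to the 128-bit encoding.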
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 #if INCLUDE_RTM_OPT
 176 
 177 // Update rtm_counters based on abort status
 178 // input: abort_status
 179 //        rtm_counters (RTMLockingCounters*)
 180 // flags are killed
 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 182 
 183   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 184   if (PrintPreciseRTMLockingStatistics) {
 185     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 186       Label check_abort;
 187       testl(abort_status, (1<<i));
 188       jccb(Assembler::equal, check_abort);
 189       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 190       bind(check_abort);
 191     }
 192   }
 193 }
 194 
// Branch if (random & (count-1)) != 0; count must be a power of two (2^n)
 196 // tmp, scr and flags are killed
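// E.g. with count == 64 the branch is taken whenever the low 6 bits of the time-stamp counter
// are non-zero, i.e. roughly 63 times out of 64, so the code that falls through executes at
// about a 1/count sampling rate.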
 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 198   assert(tmp == rax, "");
 199   assert(scr == rdx, "");
 200   rdtsc(); // modifies EDX:EAX
 201   andptr(tmp, count-1);
 202   jccb(Assembler::notZero, brLabel);
 203 }
 204 
 205 // Perform abort ratio calculation, set no_rtm bit if high ratio
 206 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 207 // tmpReg, rtm_counters_Reg and flags are killed
 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 209                                                     Register rtm_counters_Reg,
 210                                                     RTMLockingCounters* rtm_counters,
 211                                                     Metadata* method_data) {
 212   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 213 
 214   if (RTMLockingCalculationDelay > 0) {
 215     // Delay calculation
 216     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 217     testptr(tmpReg, tmpReg);
 218     jccb(Assembler::equal, L_done);
 219   }
 220   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 221   //   Aborted transactions = abort_count * 100
 222   //   All transactions = total_count *  RTMTotalCountIncrRate
 223   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
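  //   For example (illustrative numbers): with RTMTotalCountIncrRate == 64 and RTMAbortRatio == 50,
  //   an abort_count of 1000 against a sampled total_count of 10000 compares 1000*100 = 100,000
  //   with 10000*64*50 = 32,000,000, so the no_rtm bit would not be set.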
 224 
 225   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 226   cmpptr(tmpReg, RTMAbortThreshold);
 227   jccb(Assembler::below, L_check_always_rtm2);
 228   imulptr(tmpReg, tmpReg, 100);
 229 
 230   Register scrReg = rtm_counters_Reg;
 231   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 232   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 233   imulptr(scrReg, scrReg, RTMAbortRatio);
 234   cmpptr(tmpReg, scrReg);
 235   jccb(Assembler::below, L_check_always_rtm1);
 236   if (method_data != nullptr) {
 237     // set rtm_state to "no rtm" in MDO
 238     mov_metadata(tmpReg, method_data);
 239     lock();
 240     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 241   }
 242   jmpb(L_done);
 243   bind(L_check_always_rtm1);
 244   // Reload RTMLockingCounters* address
 245   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 246   bind(L_check_always_rtm2);
 247   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 248   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 249   jccb(Assembler::below, L_done);
 250   if (method_data != nullptr) {
 251     // set rtm_state to "always rtm" in MDO
 252     mov_metadata(tmpReg, method_data);
 253     lock();
 254     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 255   }
 256   bind(L_done);
 257 }
 258 
 259 // Update counters and perform abort ratio calculation
 260 // input:  abort_status_Reg
 261 // rtm_counters_Reg, flags are killed
 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 263                                       Register rtm_counters_Reg,
 264                                       RTMLockingCounters* rtm_counters,
 265                                       Metadata* method_data,
 266                                       bool profile_rtm) {
 267 
 268   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 269   // update rtm counters based on rax value at abort
 270   // reads abort_status_Reg, updates flags
 271   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 272   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 273   if (profile_rtm) {
 274     // Save abort status because abort_status_Reg is used by following code.
 275     if (RTMRetryCount > 0) {
 276       push(abort_status_Reg);
 277     }
 278     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 279     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 280     // restore abort status
 281     if (RTMRetryCount > 0) {
 282       pop(abort_status_Reg);
 283     }
 284   }
 285 }
 286 
 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 288 // inputs: retry_count_Reg
 289 //       : abort_status_Reg
 290 // output: retry_count_Reg decremented by 1
 291 // flags are killed
 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 293   Label doneRetry;
 294   assert(abort_status_Reg == rax, "");
 295   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 296   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 297   // if reason is in 0x6 and retry count != 0 then retry
 298   andptr(abort_status_Reg, 0x6);
 299   jccb(Assembler::zero, doneRetry);
 300   testl(retry_count_Reg, retry_count_Reg);
 301   jccb(Assembler::zero, doneRetry);
 302   pause();
 303   decrementl(retry_count_Reg);
 304   jmp(retryLabel);
 305   bind(doneRetry);
 306 }
 307 
 308 // Spin and retry if lock is busy,
 309 // inputs: box_Reg (monitor address)
 310 //       : retry_count_Reg
 311 // output: retry_count_Reg decremented by 1
 312 //       : clear z flag if retry count exceeded
 313 // tmp_Reg, scr_Reg, flags are killed
 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 315                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 316   Label SpinLoop, SpinExit, doneRetry;
 317   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 318 
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   decrementl(retry_count_Reg);
 322   movptr(scr_Reg, RTMSpinLoopCount);
 323 
 324   bind(SpinLoop);
 325   pause();
 326   decrementl(scr_Reg);
 327   jccb(Assembler::lessEqual, SpinExit);
 328   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 329   testptr(tmp_Reg, tmp_Reg);
 330   jccb(Assembler::notZero, SpinLoop);
 331 
 332   bind(SpinExit);
 333   jmp(retryLabel);
 334   bind(doneRetry);
 335   incrementl(retry_count_Reg); // clear z flag
 336 }
 337 
 338 // Use RTM for normal stack locks
 339 // Input: objReg (object to lock)
 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 341                                          Register retry_on_abort_count_Reg,
 342                                          RTMLockingCounters* stack_rtm_counters,
 343                                          Metadata* method_data, bool profile_rtm,
 344                                          Label& DONE_LABEL, Label& IsInflated) {
 345   assert(UseRTMForStackLocks, "why call this otherwise?");
 346   assert(tmpReg == rax, "");
 347   assert(scrReg == rdx, "");
 348   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 349 
 350   if (RTMRetryCount > 0) {
 351     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 352     bind(L_rtm_retry);
 353   }
 354   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 355   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 356   jcc(Assembler::notZero, IsInflated);
 357 
 358   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 359     Label L_noincrement;
 360     if (RTMTotalCountIncrRate > 1) {
 361       // tmpReg, scrReg and flags are killed
 362       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 363     }
 364     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 365     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 366     bind(L_noincrement);
 367   }
 368   xbegin(L_on_abort);
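  // Transactional region starts here. If the transaction aborts, the processor rolls back the
  // architectural state and resumes at L_on_abort with the abort status in EAX (tmpReg).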
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 370   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 371   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 372   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 373 
 374   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 375   if (UseRTMXendForLockBusy) {
 376     xend();
 377     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 378     jmp(L_decrement_retry);
 379   }
 380   else {
 381     xabort(0);
 382   }
 383   bind(L_on_abort);
 384   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 385     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 386   }
 387   bind(L_decrement_retry);
 388   if (RTMRetryCount > 0) {
 389     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 390     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 391   }
 392 }
 393 
// Use RTM for inflated locks
 395 // inputs: objReg (object to lock)
 396 //         boxReg (on-stack box address (displaced header location) - KILLED)
 397 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 399                                             Register scrReg, Register retry_on_busy_count_Reg,
 400                                             Register retry_on_abort_count_Reg,
 401                                             RTMLockingCounters* rtm_counters,
 402                                             Metadata* method_data, bool profile_rtm,
 403                                             Label& DONE_LABEL) {
 404   assert(UseRTMLocking, "why call this otherwise?");
 405   assert(tmpReg == rax, "");
 406   assert(scrReg == rdx, "");
 407   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 408   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 409 
 410   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 411   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 412 
 413   if (RTMRetryCount > 0) {
 414     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 415     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 416     bind(L_rtm_retry);
 417   }
 418   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 419     Label L_noincrement;
 420     if (RTMTotalCountIncrRate > 1) {
 421       // tmpReg, scrReg and flags are killed
 422       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 423     }
 424     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 425     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 426     bind(L_noincrement);
 427   }
 428   xbegin(L_on_abort);
 429   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 430   movptr(tmpReg, Address(tmpReg, owner_offset));
 431   testptr(tmpReg, tmpReg);
 432   jcc(Assembler::zero, DONE_LABEL);
 433   if (UseRTMXendForLockBusy) {
 434     xend();
 435     jmp(L_decrement_retry);
 436   }
 437   else {
 438     xabort(0);
 439   }
 440   bind(L_on_abort);
 441   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 442   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 443     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 444   }
 445   if (RTMRetryCount > 0) {
 446     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 447     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 448   }
 449 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 453 
 454   // Appears unlocked - try to swing _owner from null to non-null.
 455   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 456 #ifdef _LP64
 457   Register threadReg = r15_thread;
 458 #else
 459   get_thread(scrReg);
 460   Register threadReg = scrReg;
 461 #endif
 462   lock();
 463   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 464 
 465   if (RTMRetryCount > 0) {
 466     // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
 468     bind(L_decrement_retry);
 469     // Spin and retry if lock is busy.
 470     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 471   }
 472   else {
 473     bind(L_decrement_retry);
 474   }
 475 }
 476 
 477 #endif //  INCLUDE_RTM_OPT
 478 
 479 // fast_lock and fast_unlock used by C2
 480 
 481 // Because the transitions from emitted code to the runtime
 482 // monitorenter/exit helper stubs are so slow it's critical that
 483 // we inline both the stack-locking fast path and the inflated fast path.
 484 //
 485 // See also: cmpFastLock and cmpFastUnlock.
 486 //
 487 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
 489 // option would be to emit TrySlowEnter and TrySlowExit methods
 490 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 492 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 493 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 494 // In practice, however, the # of lock sites is bounded and is usually small.
 495 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 499 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 508 //
 509 // TODO:
 510 //
 511 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 512 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 513 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 514 //    the lock operators would typically be faster than reifying Self.
 515 //
 516 // *  Ideally I'd define the primitives as:
 517 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 518 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 519 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 521 //    Furthermore the register assignments are overconstrained, possibly resulting in
 522 //    sub-optimal code near the synchronization site.
 523 //
 524 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 525 //    Alternately, use a better sp-proximity test.
 526 //
 527 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 528 //    Either one is sufficient to uniquely identify a thread.
 529 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 530 //
 531 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 534 //
 535 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 536 //    But beware of excessive branch density on AMD Opterons.
 537 //
 538 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 539 //    or failure of the fast path.  If the fast path fails then we pass
 540 //    control to the slow path, typically in C.  In fast_lock and
 541 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 542 //    will emit a conditional branch immediately after the node.
 543 //    So we have branches to branches and lots of ICC.ZF games.
 544 //    Instead, it might be better to have C2 pass a "FailureLabel"
 545 //    into fast_lock and fast_unlock.  In the case of success, control
 546 //    will drop through the node.  ICC.ZF is undefined at exit.
 547 //    In the case of failure, the node will branch directly to the
 548 //    FailureLabel
 549 
 550 
 551 // obj: object to lock
 552 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 554 // scr: tmp -- KILLED
 555 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 556                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 557                                  RTMLockingCounters* rtm_counters,
 558                                  RTMLockingCounters* stack_rtm_counters,
 559                                  Metadata* method_data,
 560                                  bool use_rtm, bool profile_rtm) {
 561   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 562   // Ensure the register assignments are disjoint
 563   assert(tmpReg == rax, "");
 564 
 565   if (use_rtm) {
 566     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 567   } else {
 568     assert(cx1Reg == noreg, "");
 569     assert(cx2Reg == noreg, "");
 570     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 571   }
 572 
 573   // Possible cases that we'll encounter in fast_lock
 574   // ------------------------------------------------
 575   // * Inflated
 576   //    -- unlocked
 577   //    -- Locked
 578   //       = by self
 579   //       = by other
 580   // * neutral
 581   // * stack-locked
 582   //    -- by self
 583   //       = sp-proximity test hits
 584   //       = sp-proximity test generates false-negative
 585   //    -- by other
 586   //
 587 
 588   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 589 
 590   if (DiagnoseSyncOnValueBasedClasses != 0) {
 591     load_klass(tmpReg, objReg, scrReg);
 592     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 593     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 594     jcc(Assembler::notZero, DONE_LABEL);
 595   }
 596 
 597 #if INCLUDE_RTM_OPT
 598   if (UseRTMForStackLocks && use_rtm) {
 599     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 600     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 601                       stack_rtm_counters, method_data, profile_rtm,
 602                       DONE_LABEL, IsInflated);
 603   }
 604 #endif // INCLUDE_RTM_OPT
 605 
 606   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 607   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 608   jcc(Assembler::notZero, IsInflated);
 609 
 610   if (LockingMode == LM_MONITOR) {
 611     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 612     testptr(objReg, objReg);
 613   } else {
 614     assert(LockingMode == LM_LEGACY, "must be");
 615     // Attempt stack-locking ...
 616     orptr (tmpReg, markWord::unlocked_value);
 617     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 618     lock();
 619     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 620     jcc(Assembler::equal, COUNT);           // Success
 621 
 622     // Recursive locking.
 623     // The object is stack-locked: markword contains stack pointer to BasicLock.
 624     // Locked by current thread if difference with current SP is less than one page.
 625     subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
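    // In effect: the displaced header address (current markword) must be word-aligned and lie at
    // a non-negative offset of less than one page above rsp to count as a recursive stack-lock.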
 627     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 628     movptr(Address(boxReg, 0), tmpReg);
 629   }
 630   jmp(DONE_LABEL);
 631 
 632   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value tag
 634 
 635 #if INCLUDE_RTM_OPT
 636   // Use the same RTM locking code in 32- and 64-bit VM.
 637   if (use_rtm) {
 638     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 639                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 640   } else {
 641 #endif // INCLUDE_RTM_OPT
 642 
 643 #ifndef _LP64
 644   // The object is inflated.
 645 
 646   // boxReg refers to the on-stack BasicLock in the current frame.
 647   // We'd like to write:
 648   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
// This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 650   // additional latency as we have another ST in the store buffer that must drain.
 651 
 652   // avoid ST-before-CAS
 653   // register juggle because we need tmpReg for cmpxchgptr below
 654   movptr(scrReg, boxReg);
 655   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 656 
 657   // Optimistic form: consider XORL tmpReg,tmpReg
 658   movptr(tmpReg, NULL_WORD);
 659 
 660   // Appears unlocked - try to swing _owner from null to non-null.
 661   // Ideally, I'd manifest "Self" with get_thread and then attempt
 662   // to CAS the register containing Self into m->Owner.
 663   // But we don't have enough registers, so instead we can either try to CAS
 664   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 665   // we later store "Self" into m->Owner.  Transiently storing a stack address
 666   // (rsp or the address of the box) into  m->owner is harmless.
 667   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 668   lock();
 669   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 670   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 671   // If we weren't able to swing _owner from null to the BasicLock
 672   // then take the slow path.
 673   jccb  (Assembler::notZero, NO_COUNT);
 674   // update _owner from BasicLock to thread
 675   get_thread (scrReg);                    // beware: clobbers ICCs
 676   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 677   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 678 
 679   // If the CAS fails we can either retry or pass control to the slow path.
 680   // We use the latter tactic.
 681   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 682   // If the CAS was successful ...
 683   //   Self has acquired the lock
 684   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 685   // Intentional fall-through into DONE_LABEL ...
 686 #else // _LP64
 687   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 688   movq(scrReg, tmpReg);
 689   xorq(tmpReg, tmpReg);
 690   lock();
 691   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 692   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 693   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 694   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 695   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 696   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 697 
 698   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 699   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 700   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 701   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 702 #endif // _LP64
 703 #if INCLUDE_RTM_OPT
 704   } // use_rtm()
 705 #endif
 706   bind(DONE_LABEL);
 707 
 708   // ZFlag == 1 count in fast path
 709   // ZFlag == 0 count in slow path
 710   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 711 
 712   bind(COUNT);
 713   // Count monitors in fast path
 714   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 715 
 716   xorl(tmpReg, tmpReg); // Set ZF == 1
 717 
 718   bind(NO_COUNT);
 719 
 720   // At NO_COUNT the icc ZFlag is set as follows ...
 721   // fast_unlock uses the same protocol.
 722   // ZFlag == 1 -> Success
 723   // ZFlag == 0 -> Failure - force control through the slow path
 724 }
 725 
 726 // obj: object to unlock
 727 // box: box address (displaced header location), killed.  Must be EAX.
 728 // tmp: killed, cannot be obj nor box.
 729 //
 730 // Some commentary on balanced locking:
 731 //
 732 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 733 // Methods that don't have provably balanced locking are forced to run in the
 734 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 735 // The interpreter provides two properties:
 736 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 742 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 744 // B() doesn't have provably balanced locking so it runs in the interpreter.
 745 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 746 // is still locked by A().
 747 //
 748 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 749 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 750 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 751 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 755 // A perfectly viable alternative is to elide the owner check except when
 756 // Xcheck:jni is enabled.
 757 
 758 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 759   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 760   assert(boxReg == rax, "");
 761   assert_different_registers(objReg, boxReg, tmpReg);
 762 
 763   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 764 
 765 #if INCLUDE_RTM_OPT
 766   if (UseRTMForStackLocks && use_rtm) {
 767     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 768     Label L_regular_unlock;
 769     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 770     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 771     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 772     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 773     xend();                                                           // otherwise end...
 774     jmp(DONE_LABEL);                                                  // ... and we're done
 775     bind(L_regular_unlock);
 776   }
 777 #endif
 778 
 779   if (LockingMode == LM_LEGACY) {
 780     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 781     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 782   }
 783   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 784   if (LockingMode != LM_MONITOR) {
 785     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 786     jcc(Assembler::zero, Stacked);
 787   }
 788 
 789   // It's inflated.
 790 
 791 #if INCLUDE_RTM_OPT
 792   if (use_rtm) {
 793     Label L_regular_inflated_unlock;
 794     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 795     movptr(boxReg, Address(tmpReg, owner_offset));
 796     testptr(boxReg, boxReg);
 797     jccb(Assembler::notZero, L_regular_inflated_unlock);
 798     xend();
 799     jmp(DONE_LABEL);
 800     bind(L_regular_inflated_unlock);
 801   }
 802 #endif
 803 
 804   // Despite our balanced locking property we still check that m->_owner == Self
 805   // as java routines or native JNI code called by this thread might
 806   // have released the lock.
 807   // Refer to the comments in synchronizer.cpp for how we might encode extra
 808   // state in _succ so we can avoid fetching EntryList|cxq.
 809   //
 810   // If there's no contention try a 1-0 exit.  That is, exit without
 811   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 812   // we detect and recover from the race that the 1-0 exit admits.
 813   //
 814   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 815   // before it STs null into _owner, releasing the lock.  Updates
 816   // to data protected by the critical section must be visible before
 817   // we drop the lock (and thus before any other thread could acquire
 818   // the lock and observe the fields protected by the lock).
 819   // IA32's memory-model is SPO, so STs are ordered with respect to
 820   // each other and there's no need for an explicit barrier (fence).
 821   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 822 #ifndef _LP64
 823   // Note that we could employ various encoding schemes to reduce
 824   // the number of loads below (currently 4) to just 2 or 3.
 825   // Refer to the comments in synchronizer.cpp.
 826   // In practice the chain of fetches doesn't seem to impact performance, however.
 827   xorptr(boxReg, boxReg);
 828   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 829   jccb  (Assembler::notZero, DONE_LABEL);
 830   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 831   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 832   jccb  (Assembler::notZero, DONE_LABEL);
 833   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 834   jmpb  (DONE_LABEL);
 835 #else // _LP64
 836   // It's inflated
 837   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 838 
 839   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 840   jccb(Assembler::equal, LNotRecursive);
 841 
 842   // Recursive inflated unlock
 843   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 844   jmpb(LSuccess);
 845 
 846   bind(LNotRecursive);
 847   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 848   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 849   jccb  (Assembler::notZero, CheckSucc);
 850   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 851   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 852   jmpb  (DONE_LABEL);
 853 
 854   // Try to avoid passing control into the slow_path ...
 855   bind  (CheckSucc);
 856 
 857   // The following optional optimization can be elided if necessary
 858   // Effectively: if (succ == null) goto slow path
 859   // The code reduces the window for a race, however,
 860   // and thus benefits performance.
 861   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 862   jccb  (Assembler::zero, LGoSlowPath);
 863 
 864   xorptr(boxReg, boxReg);
 865   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 866   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 867 
 868   // Memory barrier/fence
 869   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 870   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 871   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 872   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 873   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 874   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 875   lock(); addl(Address(rsp, 0), 0);
 876 
 877   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 878   jccb  (Assembler::notZero, LSuccess);
 879 
 880   // Rare inopportune interleaving - race.
 881   // The successor vanished in the small window above.
 882   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 883   // We need to ensure progress and succession.
 884   // Try to reacquire the lock.
 885   // If that fails then the new owner is responsible for succession and this
 886   // thread needs to take no further action and can exit via the fast path (success).
 887   // If the re-acquire succeeds then pass control into the slow path.
// As implemented, this latter mode is horrible because we generate more
// coherence traffic on the lock *and* artificially extend the critical section
// length by virtue of passing control into the slow path.
 891 
 892   // box is really RAX -- the following CMPXCHG depends on that binding
 893   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 894   lock();
 895   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 896   // There's no successor so we tried to regrab the lock.
 897   // If that didn't work, then another thread grabbed the
 898   // lock so we're done (and exit was a success).
 899   jccb  (Assembler::notEqual, LSuccess);
 900   // Intentional fall-through into slow path
 901 
 902   bind  (LGoSlowPath);
 903   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 904   jmpb  (DONE_LABEL);
 905 
 906   bind  (LSuccess);
 907   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 908   jmpb  (DONE_LABEL);
 909 
 910 #endif
 911   if (LockingMode == LM_LEGACY) {
 912     bind  (Stacked);
 913     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 914     lock();
 915     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 916     // Intentional fall-thru into DONE_LABEL
 917   }
 918 
 919   bind(DONE_LABEL);
 920 
 921   // ZFlag == 1 count in fast path
 922   // ZFlag == 0 count in slow path
 923   jccb(Assembler::notZero, NO_COUNT);
 924 
 925   bind(COUNT);
 926   // Count monitors in fast path
 927 #ifndef _LP64
 928   get_thread(tmpReg);
 929   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 930 #else // _LP64
 931   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 932 #endif
 933 
 934   xorl(tmpReg, tmpReg); // Set ZF == 1
 935 
 936   bind(NO_COUNT);
 937 }
 938 
 939 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 940                                               Register t, Register thread) {
 941   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 942   assert(rax_reg == rax, "Used for CAS");
 943   assert_different_registers(obj, box, rax_reg, t, thread);
 944 
 945   // Handle inflated monitor.
 946   Label inflated;
 947   // Finish fast lock successfully. ZF value is irrelevant.
 948   Label locked;
 949   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 950   Label slow_path;
 951 
 952   // Clear box. TODO[OMWorld]: Is this necessary? May also defer this to not write twice.
 953   movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 954 
 955   if (DiagnoseSyncOnValueBasedClasses != 0) {
 956     load_klass(rax_reg, obj, t);
 957     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 958     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 959     jcc(Assembler::notZero, slow_path);
 960   }
 961 
 962   const Register mark = t;
 963 
 964   { // Lightweight Lock
 965 
 966     Label push;
 967 
 968     const Register top = rax_reg;
 969 
 970     // Load the mark.
 971     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 972 
 973     // Prefetch top.
 974     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 975 
 976     // Check for monitor (0b10).
 977     testptr(mark, markWord::monitor_value);
 978     jcc(Assembler::notZero, inflated);
 979 
 980     // Check if lock-stack is full.
 981     cmpl(top, LockStack::end_offset() - 1);
 982     jcc(Assembler::greater, slow_path);
 983 
 984     // Check if recursive.
 985     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 986     jccb(Assembler::equal, push);
 987 
 988     // Try to lock. Transition lock bits 0b01 => 0b00
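    // rax_reg is loaded with the expected unlocked mark (lock bits 0b01) and mark with the new
    // fast-locked value (lock bits 0b00); the cmpxchg below installs it only if nobody raced us.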
 989     movptr(rax_reg, mark);
 990     orptr(rax_reg, markWord::unlocked_value);
 991     andptr(mark, ~(int32_t)markWord::unlocked_value);
 992     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 993     jcc(Assembler::notEqual, slow_path);
 994 
 995     bind(push);
 996     // After successful lock, push object on lock-stack.
 997     // TODO[OMWorld]: Was prepush better?
 998     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 999     movptr(Address(thread, top), obj);
1000     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1001     jmpb(locked);
1002   }
1003 
1004   { // Handle inflated monitor.
1005     bind(inflated);
1006 
1007     const Register monitor = t;
1008 
1009     if (!OMUseC2Cache) {
1010       jmp(slow_path);
1011     } else {
1012       if (OMCacheHitRate) increment(Address(thread, JavaThread::lock_lookup_offset()));
1013 
1014       // Fetch ObjectMonitor* from the cache or take the slow-path.
1015       Label monitor_found;
1016 
1017       // Load cache address
1018       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
1019 
1020       const int num_unrolled = MIN2(OMC2UnrollCacheEntries, OMCacheSize);
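      // Probe the first num_unrolled cache slots with straight-line compares; any remaining
      // slots are scanned by the loop below when the loop tail is enabled, otherwise we give
      // up and take the slow path.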
1021       for (int i = 0; i < num_unrolled; i++) {
1022         cmpptr(obj, Address(t));
1023         jccb(Assembler::equal, monitor_found);
1024         if (i + 1 != num_unrolled) {
1025           increment(t, in_bytes(OMCache::oop_to_oop_difference()));
1026         }
1027       }
1028 
1029       if (num_unrolled == 0 || (OMC2UnrollCacheLookupLoopTail && num_unrolled != OMCacheSize)) {
1030         if (num_unrolled != 0) {
1031           // Loop after unrolling, advance iterator.
1032           increment(t, in_bytes(OMCache::oop_to_oop_difference()));
1033         }
1034 
1035         Label loop;
1036 
1037         // Search for obj in cache.
1038         bind(loop);
1039 
1040         // Check for match.
1041         cmpptr(obj, Address(t));
1042         jccb(Assembler::equal, monitor_found);
1043 
1044         // Search until null encountered, guaranteed _null_sentinel at end.
1045         cmpptr(Address(t), 1);
1046         jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
1047         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
1048         jmpb(loop);
1049       } else {
1050         jmp(slow_path);
1051       }
1052 
1053       // Cache hit.
1054       bind(monitor_found);
1055       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
1056       if (OMCacheHitRate) increment(Address(thread, JavaThread::lock_hit_offset()));
1057 
1058       Label monitor_locked;
1059       // Lock the monitor.
1060       Label recursion;
1061       if (OMRecursiveFastPath) {
1062         // Check owner for recursion first.
1063         cmpptr(thread, Address(monitor, ObjectMonitor::owner_offset()));
1064         jccb(Assembler::equal, recursion);
1065       }
1066 
1067       // CAS owner (null => current thread).
1068       xorptr(rax, rax);
1069       lock(); cmpxchgptr(thread, Address(monitor, ObjectMonitor::owner_offset()));
1070       jccb(Assembler::equal, monitor_locked);
1071 
1072       if (OMRecursiveFastPath) {
1073         // Recursion already checked.
1074         jmpb(slow_path);
1075       } else {
1076         // Check if recursive.
1077         cmpptr(thread, rax);
1078         jccb(Assembler::notEqual, slow_path);
1079       }
1080 
1081       // Recursive.
1082       bind(recursion);
1083       increment(Address(monitor, ObjectMonitor::recursions_offset()));
1084 
1085       bind(monitor_locked);
1086       // Cache the monitor for unlock
1087       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
1088     }
1089   }
1090 
1091   bind(locked);
1092   increment(Address(thread, JavaThread::held_monitor_count_offset()));
1093   // Set ZF = 1
1094   xorl(rax_reg, rax_reg);
1095 
1096 #ifdef ASSERT
1097   // Check that locked label is reached with ZF set.
1098   Label zf_correct;
1099   Label zf_bad_zero;
1100   jcc(Assembler::zero, zf_correct);
1101   jmp(zf_bad_zero);
1102 #endif
1103 
1104   bind(slow_path);
1105 #ifdef ASSERT
1106   // Check that slow_path label is reached with ZF not set.
1107   jcc(Assembler::notZero, zf_correct);
1108   stop("Fast Lock ZF != 0");
1109   bind(zf_bad_zero);
1110   stop("Fast Lock ZF != 1");
1111   bind(zf_correct);
1112 #endif
1113   // C2 uses the value of ZF to determine the continuation.
1114 }
1115 
1116 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
1117   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1118   assert(reg_rax == rax, "Used for CAS");
1119   assert_different_registers(obj, reg_rax, t);
1120 
1121   // Handle inflated monitor.
1122   Label inflated, inflated_check_lock_stack;
1123   // Finish fast unlock successfully.  MUST jump with ZF == 1
1124   Label unlocked;
1125 
1126   // Assume success.
1127   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
1128 
1129   const Register mark = t;
1130   const Register monitor = t;
1131   const Register top = t;
1132   const Register box = reg_rax;
1133 
1134   Label dummy;
1135   C2FastUnlockLightweightStub* stub = nullptr;
1136 
1137   if (!Compile::current()->output()->in_scratch_emit_size()) {
1138     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
1139     Compile::current()->output()->add_stub(stub);
1140   }
1141 
1142   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1143   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1144   Label& slow_path = stub == nullptr ? dummy : stub->slow_path();
1145 
1146   { // Lightweight Unlock
1147 
1148     // Load top.
1149     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1150 
1151     // Check if obj is top of lock-stack.
1152     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1153     // Top of lock stack was not obj. Must be monitor.
1154     jcc(Assembler::notEqual, inflated_check_lock_stack);
1155 
1156     // Pop lock-stack.
1157     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1158     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1159 
1160     // Check if recursive.
1161     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1162     jcc(Assembler::equal, unlocked);
1163 
1164     // We elide the monitor check, let the CAS fail instead.
1165 
1166     // Load mark.
1167     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1168 
1169     // Try to unlock. Transition lock bits 0b00 => 0b01
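    // reg_rax is loaded with the expected fast-locked mark (lock bits 0b00) and mark with the
    // new unlocked value (lock bits 0b01); if the mark was inflated concurrently the cmpxchg
    // simply fails and we take push_and_slow_path.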
1170     movptr(reg_rax, mark);
1171     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1172     orptr(mark, markWord::unlocked_value);
1173     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1174     jcc(Assembler::notEqual, push_and_slow_path);
1175     jmp(unlocked);
1176   }
1177 
1178 
1179   { // Handle inflated monitor.
1180     bind(inflated_check_lock_stack);
1181 #ifdef ASSERT
1182     Label check_done;
1183     subl(top, oopSize);
1184     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1185     jcc(Assembler::below, check_done);
1186     cmpptr(obj, Address(thread, top));
1187     jccb(Assembler::notEqual, inflated_check_lock_stack);
1188     stop("Fast Unlock lock on stack");
1189     bind(check_done);
1190     const Register mark = t;
1191     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1192     testptr(mark, markWord::monitor_value);
1193     jccb(Assembler::notZero, inflated);
1194     stop("Fast Unlock not monitor");
1195 #endif
1196 
1197     bind(inflated);
1198 
1199     if (!OMUseC2Cache) {
1200       jmp(slow_path);
1201     } else {
1202       if (OMCacheHitRate) increment(Address(thread, JavaThread::unlock_lookup_offset()));
1203       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
1204       // TODO[OMWorld]: Figure out the correctness surrounding the owner field here. Obj is not on the lock stack
1205       //                but this means this thread must have locked on the inflated monitor at some point. So it
1206       //                should not be anonymous.
1207       cmpptr(monitor, 2);
1208       jcc(Assembler::below, slow_path);
1209 
1210       if (OMCacheHitRate) increment(Address(thread, JavaThread::unlock_hit_offset()));
1211 
1212       Label recursive;
1213 
1214       // Check if recursive.
      cmpptr(Address(monitor, ObjectMonitor::recursions_offset()), 0);
1216       jccb(Assembler::notEqual, recursive);
1217 
1218       // Check if the entry lists are empty.
1219       movptr(reg_rax, Address(monitor, ObjectMonitor::cxq_offset()));
1220       orptr(reg_rax, Address(monitor, ObjectMonitor::EntryList_offset()));
1221       jcc(Assembler::notZero, check_successor);
1222 
1223       // Release lock.
1224       movptr(Address(monitor, ObjectMonitor::owner_offset()), NULL_WORD);
1225       jmpb(unlocked);
1226 
1227       // Recursive unlock.
1228       bind(recursive);
1229       decrement(Address(monitor, ObjectMonitor::recursions_offset()));
      xorl(t, t); // Set ZF = 1 to indicate success
1231     }
1232   }
1233 
1234   bind(unlocked);
1235   if (stub != nullptr) {
1236     bind(stub->unlocked_continuation());
1237   }
1238 
1239 #ifdef ASSERT
1240   // Check that unlocked label is reached with ZF set.
1241   Label zf_correct;
1242   jcc(Assembler::zero, zf_correct);
1243   stop("Fast Unlock ZF != 1");
1244 #endif
1245 
1246   if (stub != nullptr) {
1247     bind(stub->slow_path_continuation());
1248   }
1249 #ifdef ASSERT
1250   // Check that stub->continuation() label is reached with ZF not set.
1251   jccb(Assembler::notZero, zf_correct);
1252   stop("Fast Unlock ZF != 0");
1253   bind(zf_correct);
1254 #endif
1255   // C2 uses the value of ZF to determine the continuation.
1256 }
1257 
1258 //-------------------------------------------------------------------------------------------
1259 // Generic instructions support for use in .ad files C2 code generation
1260 
1261 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1262   if (dst != src) {
1263     movdqu(dst, src);
1264   }
1265   if (opcode == Op_AbsVD) {
1266     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1267   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1269     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1270   }
1271 }
1272 
1273 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1274   if (opcode == Op_AbsVD) {
1275     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1276   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1278     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1279   }
1280 }
1281 
1282 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1283   if (dst != src) {
1284     movdqu(dst, src);
1285   }
1286   if (opcode == Op_AbsVF) {
1287     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1288   } else {
1289     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1290     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1291   }
1292 }
1293 
1294 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1295   if (opcode == Op_AbsVF) {
1296     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1297   } else {
1298     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
1299     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1300   }
1301 }
1302 
1303 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1304   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1305   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1306 
1307   if (opcode == Op_MinV) {
1308     if (elem_bt == T_BYTE) {
1309       pminsb(dst, src);
1310     } else if (elem_bt == T_SHORT) {
1311       pminsw(dst, src);
1312     } else if (elem_bt == T_INT) {
1313       pminsd(dst, src);
1314     } else {
1315       assert(elem_bt == T_LONG, "required");
1316       assert(tmp == xmm0, "required");
1317       assert_different_registers(dst, src, tmp);
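           // No pminsq below AVX-512: build a (dst > src) mask in xmm0, then blendvpd
           // (which uses xmm0 implicitly) picks src exactly where dst is larger.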
1318       movdqu(xmm0, dst);
1319       pcmpgtq(xmm0, src);
1320       blendvpd(dst, src);  // xmm0 as mask
1321     }
1322   } else { // opcode == Op_MaxV
1323     if (elem_bt == T_BYTE) {
1324       pmaxsb(dst, src);
1325     } else if (elem_bt == T_SHORT) {
1326       pmaxsw(dst, src);
1327     } else if (elem_bt == T_INT) {
1328       pmaxsd(dst, src);
1329     } else {
1330       assert(elem_bt == T_LONG, "required");
1331       assert(tmp == xmm0, "required");
1332       assert_different_registers(dst, src, tmp);
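           // Mirror of the min case: the (src > dst) mask in xmm0 makes blendvpd pick src
           // exactly where src is larger.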
1333       movdqu(xmm0, src);
1334       pcmpgtq(xmm0, dst);
1335       blendvpd(dst, src);  // xmm0 as mask
1336     }
1337   }
1338 }
1339 
1340 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1341                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1342                                  int vlen_enc) {
1343   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1344 
1345   if (opcode == Op_MinV) {
1346     if (elem_bt == T_BYTE) {
1347       vpminsb(dst, src1, src2, vlen_enc);
1348     } else if (elem_bt == T_SHORT) {
1349       vpminsw(dst, src1, src2, vlen_enc);
1350     } else if (elem_bt == T_INT) {
1351       vpminsd(dst, src1, src2, vlen_enc);
1352     } else {
1353       assert(elem_bt == T_LONG, "required");
1354       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1355         vpminsq(dst, src1, src2, vlen_enc);
1356       } else {
1357         assert_different_registers(dst, src1, src2);
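             // No vpminsq without AVX-512VL: compute (src1 > src2) into dst and blend so that
             // src2 is chosen exactly where src1 is larger.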
1358         vpcmpgtq(dst, src1, src2, vlen_enc);
1359         vblendvpd(dst, src1, src2, dst, vlen_enc);
1360       }
1361     }
1362   } else { // opcode == Op_MaxV
1363     if (elem_bt == T_BYTE) {
1364       vpmaxsb(dst, src1, src2, vlen_enc);
1365     } else if (elem_bt == T_SHORT) {
1366       vpmaxsw(dst, src1, src2, vlen_enc);
1367     } else if (elem_bt == T_INT) {
1368       vpmaxsd(dst, src1, src2, vlen_enc);
1369     } else {
1370       assert(elem_bt == T_LONG, "required");
1371       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1372         vpmaxsq(dst, src1, src2, vlen_enc);
1373       } else {
1374         assert_different_registers(dst, src1, src2);
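             // Mirror of the min case: where src1 > src2 the blend keeps src1.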
1375         vpcmpgtq(dst, src1, src2, vlen_enc);
1376         vblendvpd(dst, src2, src1, dst, vlen_enc);
1377       }
1378     }
1379   }
1380 }
1381 
1382 // Float/Double min max
1383 
1384 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1385                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1386                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1387                                    int vlen_enc) {
1388   assert(UseAVX > 0, "required");
1389   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1390          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1391   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1392   assert_different_registers(a, tmp, atmp, btmp);
1393   assert_different_registers(b, tmp, atmp, btmp);
1394 
1395   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1396   bool is_double_word = is_double_word_type(elem_bt);
1397 
1398   /* Note on 'non-obvious' assembly sequence:
1399    *
1400    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1401    * and Java on how they handle floats:
1402    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1403    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1404    *
1405    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1406    *  a. -0.0/+0.0: Bias negative (positive) numbers to the second parameter before vminps (vmaxps)
1407    *                (only useful when signs differ, a no-op otherwise)
1408    *  b. NaN: Check whether it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1409    *
1410    *  The following pseudocode describes the algorithm for max[FD] (the min algorithm is analogous):
1411    *   btmp = (b < +0.0) ? a : b
1412    *   atmp = (b < +0.0) ? b : a
1413    *   Tmp  = Max_Float(atmp, btmp)
1414    *   Res  = (atmp == NaN) ? atmp : Tmp
1415    */
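       /* Worked example for max[F]: with a = -0.0f and b = +0.0f the blends leave atmp = -0.0 and
        * btmp = +0.0, and vmaxps returns its second operand (+0.0) on equal inputs, matching Java.
        * With a = NaN and b = 1.0f the UNORD_Q compare flags the NaN in atmp, so the final blend
        * propagates NaN instead of the 1.0 produced by vmaxps.
        */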
1416 
1417   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1418   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1419   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1420   XMMRegister mask;
1421 
1422   if (!is_double_word && is_min) {
1423     mask = a;
1424     vblend = &MacroAssembler::vblendvps;
1425     vmaxmin = &MacroAssembler::vminps;
1426     vcmp = &MacroAssembler::vcmpps;
1427   } else if (!is_double_word && !is_min) {
1428     mask = b;
1429     vblend = &MacroAssembler::vblendvps;
1430     vmaxmin = &MacroAssembler::vmaxps;
1431     vcmp = &MacroAssembler::vcmpps;
1432   } else if (is_double_word && is_min) {
1433     mask = a;
1434     vblend = &MacroAssembler::vblendvpd;
1435     vmaxmin = &MacroAssembler::vminpd;
1436     vcmp = &MacroAssembler::vcmppd;
1437   } else {
1438     assert(is_double_word && !is_min, "sanity");
1439     mask = b;
1440     vblend = &MacroAssembler::vblendvpd;
1441     vmaxmin = &MacroAssembler::vmaxpd;
1442     vcmp = &MacroAssembler::vcmppd;
1443   }
1444 
1445   // Pick maxmin/scratch so that register overlaps (dst == btmp) do not break the EnableX86ECoreOpts blend path
1446   XMMRegister maxmin, scratch;
1447   if (dst == btmp) {
1448     maxmin = btmp;
1449     scratch = tmp;
1450   } else {
1451     maxmin = tmp;
1452     scratch = btmp;
1453   }
1454 
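       // On E-cores the variable-blend forms (vblendv*) are comparatively slow, so with EnableX86ECoreOpts
       // the sign-based mask is presumably widened to a full-width mask up front (vpsrad by 32 for floats,
       // a signed compare against zero for doubles) and the blend helper is told not to recompute it.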
1455   bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1456   if (precompute_mask && !is_double_word) {
1457     vpsrad(tmp, mask, 32, vlen_enc);
1458     mask = tmp;
1459   } else if (precompute_mask && is_double_word) {
1460     vpxor(tmp, tmp, tmp, vlen_enc);
1461     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1462     mask = tmp;
1463   }
1464 
1465   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1466   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1467   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1468   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1469   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1470 }
1471 
1472 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1473                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1474                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1475                                     int vlen_enc) {
1476   assert(UseAVX > 2, "required");
1477   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1478          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1479   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1480   assert_different_registers(dst, a, atmp, btmp);
1481   assert_different_registers(dst, b, atmp, btmp);
1482 
1483   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1484   bool is_double_word = is_double_word_type(elem_bt);
1485   bool merge = true;
1486 
1487   if (!is_double_word && is_min) {
1488     evpmovd2m(ktmp, a, vlen_enc);
1489     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1490     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1491     vminps(dst, atmp, btmp, vlen_enc);
1492     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1493     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1494   } else if (!is_double_word && !is_min) {
1495     evpmovd2m(ktmp, b, vlen_enc);
1496     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1497     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1498     vmaxps(dst, atmp, btmp, vlen_enc);
1499     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1500     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1501   } else if (is_double_word && is_min) {
1502     evpmovq2m(ktmp, a, vlen_enc);
1503     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1504     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1505     vminpd(dst, atmp, btmp, vlen_enc);
1506     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1507     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1508   } else {
1509     assert(is_double_word && !is_min, "sanity");
1510     evpmovq2m(ktmp, b, vlen_enc);
1511     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1512     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1513     vmaxpd(dst, atmp, btmp, vlen_enc);
1514     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1515     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1516   }
1517 }
1518 
1519 // Float/Double signum
1520 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1521   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1522 
1523   Label DONE_LABEL;
1524 
1525   if (opcode == Op_SignumF) {
1526     assert(UseSSE > 0, "required");
1527     ucomiss(dst, zero);
1528     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1529     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1530     movflt(dst, one);
1531     jcc(Assembler::above, DONE_LABEL);
1532     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1533   } else if (opcode == Op_SignumD) {
1534     assert(UseSSE > 1, "required");
1535     ucomisd(dst, zero);
1536     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1537     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1538     movdbl(dst, one);
1539     jcc(Assembler::above, DONE_LABEL);
1540     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1541   }
1542 
1543   bind(DONE_LABEL);
1544 }
1545 
1546 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1547   if (sign) {
1548     pmovsxbw(dst, src);
1549   } else {
1550     pmovzxbw(dst, src);
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1555   if (sign) {
1556     vpmovsxbw(dst, src, vector_len);
1557   } else {
1558     vpmovzxbw(dst, src, vector_len);
1559   }
1560 }
1561 
1562 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1563   if (sign) {
1564     vpmovsxbd(dst, src, vector_len);
1565   } else {
1566     vpmovzxbd(dst, src, vector_len);
1567   }
1568 }
1569 
1570 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1571   if (sign) {
1572     vpmovsxwd(dst, src, vector_len);
1573   } else {
1574     vpmovzxwd(dst, src, vector_len);
1575   }
1576 }
1577 
1578 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1579                                      int shift, int vector_len) {
1580   if (opcode == Op_RotateLeftV) {
1581     if (etype == T_INT) {
1582       evprold(dst, src, shift, vector_len);
1583     } else {
1584       assert(etype == T_LONG, "expected type T_LONG");
1585       evprolq(dst, src, shift, vector_len);
1586     }
1587   } else {
1588     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1589     if (etype == T_INT) {
1590       evprord(dst, src, shift, vector_len);
1591     } else {
1592       assert(etype == T_LONG, "expected type T_LONG");
1593       evprorq(dst, src, shift, vector_len);
1594     }
1595   }
1596 }
1597 
1598 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1599                                      XMMRegister shift, int vector_len) {
1600   if (opcode == Op_RotateLeftV) {
1601     if (etype == T_INT) {
1602       evprolvd(dst, src, shift, vector_len);
1603     } else {
1604       assert(etype == T_LONG, "expected type T_LONG");
1605       evprolvq(dst, src, shift, vector_len);
1606     }
1607   } else {
1608     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1609     if (etype == T_INT) {
1610       evprorvd(dst, src, shift, vector_len);
1611     } else {
1612       assert(etype == T_LONG, "expected type T_LONG");
1613       evprorvq(dst, src, shift, vector_len);
1614     }
1615   }
1616 }
1617 
1618 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1619   if (opcode == Op_RShiftVI) {
1620     psrad(dst, shift);
1621   } else if (opcode == Op_LShiftVI) {
1622     pslld(dst, shift);
1623   } else {
1624     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1625     psrld(dst, shift);
1626   }
1627 }
1628 
1629 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1630   switch (opcode) {
1631     case Op_RShiftVI:  psrad(dst, shift); break;
1632     case Op_LShiftVI:  pslld(dst, shift); break;
1633     case Op_URShiftVI: psrld(dst, shift); break;
1634 
1635     default: assert(false, "%s", NodeClassNames[opcode]);
1636   }
1637 }
1638 
1639 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1640   if (opcode == Op_RShiftVI) {
1641     vpsrad(dst, nds, shift, vector_len);
1642   } else if (opcode == Op_LShiftVI) {
1643     vpslld(dst, nds, shift, vector_len);
1644   } else {
1645     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1646     vpsrld(dst, nds, shift, vector_len);
1647   }
1648 }
1649 
1650 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1651   switch (opcode) {
1652     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1653     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1654     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1655 
1656     default: assert(false, "%s", NodeClassNames[opcode]);
1657   }
1658 }
1659 
1660 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1661   switch (opcode) {
1662     case Op_RShiftVB:  // fall-through
1663     case Op_RShiftVS:  psraw(dst, shift); break;
1664 
1665     case Op_LShiftVB:  // fall-through
1666     case Op_LShiftVS:  psllw(dst, shift);   break;
1667 
1668     case Op_URShiftVS: // fall-through
1669     case Op_URShiftVB: psrlw(dst, shift);  break;
1670 
1671     default: assert(false, "%s", NodeClassNames[opcode]);
1672   }
1673 }
1674 
1675 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1676   switch (opcode) {
1677     case Op_RShiftVB:  // fall-through
1678     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1679 
1680     case Op_LShiftVB:  // fall-through
1681     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1682 
1683     case Op_URShiftVS: // fall-through
1684     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1685 
1686     default: assert(false, "%s", NodeClassNames[opcode]);
1687   }
1688 }
1689 
1690 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1691   switch (opcode) {
1692     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1693     case Op_LShiftVL:  psllq(dst, shift); break;
1694     case Op_URShiftVL: psrlq(dst, shift); break;
1695 
1696     default: assert(false, "%s", NodeClassNames[opcode]);
1697   }
1698 }
1699 
1700 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1701   if (opcode == Op_RShiftVL) {
1702     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1703   } else if (opcode == Op_LShiftVL) {
1704     psllq(dst, shift);
1705   } else {
1706     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1707     psrlq(dst, shift);
1708   }
1709 }
1710 
1711 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1712   switch (opcode) {
1713     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1714     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1715     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1716 
1717     default: assert(false, "%s", NodeClassNames[opcode]);
1718   }
1719 }
1720 
1721 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1722   if (opcode == Op_RShiftVL) {
1723     evpsraq(dst, nds, shift, vector_len);
1724   } else if (opcode == Op_LShiftVL) {
1725     vpsllq(dst, nds, shift, vector_len);
1726   } else {
1727     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1728     vpsrlq(dst, nds, shift, vector_len);
1729   }
1730 }
1731 
1732 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1733   switch (opcode) {
1734     case Op_RShiftVB:  // fall-through
1735     case Op_RShiftVS:  // fall-through
1736     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1737 
1738     case Op_LShiftVB:  // fall-through
1739     case Op_LShiftVS:  // fall-through
1740     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1741 
1742     case Op_URShiftVB: // fall-through
1743     case Op_URShiftVS: // fall-through
1744     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1745 
1746     default: assert(false, "%s", NodeClassNames[opcode]);
1747   }
1748 }
1749 
1750 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1751   switch (opcode) {
1752     case Op_RShiftVB:  // fall-through
1753     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1754 
1755     case Op_LShiftVB:  // fall-through
1756     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1757 
1758     case Op_URShiftVB: // fall-through
1759     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1760 
1761     default: assert(false, "%s", NodeClassNames[opcode]);
1762   }
1763 }
1764 
1765 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1766   assert(UseAVX >= 2, "required");
1767   switch (opcode) {
1768     case Op_RShiftVL: {
1769       if (UseAVX > 2) {
1770         assert(tmp == xnoreg, "not used");
1771         if (!VM_Version::supports_avx512vl()) {
1772           vlen_enc = Assembler::AVX_512bit;
1773         }
1774         evpsravq(dst, src, shift, vlen_enc);
1775       } else {
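             // AVX2 has no vpsravq: emulate the arithmetic shift as ((x >>> s) ^ (m >>> s)) - (m >>> s),
             // where m is the per-lane sign mask 0x8000000000000000.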
1776         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1777         vpsrlvq(dst, src, shift, vlen_enc);
1778         vpsrlvq(tmp, tmp, shift, vlen_enc);
1779         vpxor(dst, dst, tmp, vlen_enc);
1780         vpsubq(dst, dst, tmp, vlen_enc);
1781       }
1782       break;
1783     }
1784     case Op_LShiftVL: {
1785       assert(tmp == xnoreg, "not used");
1786       vpsllvq(dst, src, shift, vlen_enc);
1787       break;
1788     }
1789     case Op_URShiftVL: {
1790       assert(tmp == xnoreg, "not used");
1791       vpsrlvq(dst, src, shift, vlen_enc);
1792       break;
1793     }
1794     default: assert(false, "%s", NodeClassNames[opcode]);
1795   }
1796 }
1797 
1798 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1799 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1800   assert(opcode == Op_LShiftVB ||
1801          opcode == Op_RShiftVB ||
1802          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1803   bool sign = (opcode != Op_URShiftVB);
1804   assert(vector_len == 0, "required");
1805   vextendbd(sign, dst, src, 1);
1806   vpmovzxbd(vtmp, shift, 1);
1807   varshiftd(opcode, dst, dst, vtmp, 1);
1808   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1809   vextracti128_high(vtmp, dst);
1810   vpackusdw(dst, dst, vtmp, 0);
1811 }
1812 
1813 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1814 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1815   assert(opcode == Op_LShiftVB ||
1816          opcode == Op_RShiftVB ||
1817          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1818   bool sign = (opcode != Op_URShiftVB);
1819   int ext_vector_len = vector_len + 1;
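       // Bytes are widened to words, so the intermediate vectors below are one vector size larger.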
1820   vextendbw(sign, dst, src, ext_vector_len);
1821   vpmovzxbw(vtmp, shift, ext_vector_len);
1822   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1823   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1824   if (vector_len == 0) {
1825     vextracti128_high(vtmp, dst);
1826     vpackuswb(dst, dst, vtmp, vector_len);
1827   } else {
1828     vextracti64x4_high(vtmp, dst);
1829     vpackuswb(dst, dst, vtmp, vector_len);
1830     vpermq(dst, dst, 0xD8, vector_len);
1831   }
1832 }
1833 
1834 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1835   switch(typ) {
1836     case T_BYTE:
1837       pinsrb(dst, val, idx);
1838       break;
1839     case T_SHORT:
1840       pinsrw(dst, val, idx);
1841       break;
1842     case T_INT:
1843       pinsrd(dst, val, idx);
1844       break;
1845     case T_LONG:
1846       pinsrq(dst, val, idx);
1847       break;
1848     default:
1849       assert(false,"Should not reach here.");
1850       break;
1851   }
1852 }
1853 
1854 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1855   switch(typ) {
1856     case T_BYTE:
1857       vpinsrb(dst, src, val, idx);
1858       break;
1859     case T_SHORT:
1860       vpinsrw(dst, src, val, idx);
1861       break;
1862     case T_INT:
1863       vpinsrd(dst, src, val, idx);
1864       break;
1865     case T_LONG:
1866       vpinsrq(dst, src, val, idx);
1867       break;
1868     default:
1869       assert(false,"Should not reach here.");
1870       break;
1871   }
1872 }
1873 
1874 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1875   switch(typ) {
1876     case T_INT:
1877       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1878       break;
1879     case T_FLOAT:
1880       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1881       break;
1882     case T_LONG:
1883       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1884       break;
1885     case T_DOUBLE:
1886       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1887       break;
1888     default:
1889       assert(false,"Should not reach here.");
1890       break;
1891   }
1892 }
1893 
1894 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1895   switch(typ) {
1896     case T_INT:
1897       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1898       break;
1899     case T_FLOAT:
1900       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1901       break;
1902     case T_LONG:
1903       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1904       break;
1905     case T_DOUBLE:
1906       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1907       break;
1908     default:
1909       assert(false,"Should not reach here.");
1910       break;
1911   }
1912 }
1913 
1914 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1915   switch(typ) {
1916     case T_INT:
1917       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1918       break;
1919     case T_FLOAT:
1920       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1921       break;
1922     case T_LONG:
1923       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1924       break;
1925     case T_DOUBLE:
1926       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1927       break;
1928     default:
1929       assert(false,"Should not reach here.");
1930       break;
1931   }
1932 }
1933 
1934 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1935   if (vlen_in_bytes <= 16) {
1936     pxor (dst, dst);
1937     psubb(dst, src);
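         // Mask lanes arrive as 0/1 bytes; 0 - x turns 1 into 0xFF, and the sign extension
         // below widens that to an all-ones lane of the element type.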
1938     switch (elem_bt) {
1939       case T_BYTE:   /* nothing to do */ break;
1940       case T_SHORT:  pmovsxbw(dst, dst); break;
1941       case T_INT:    pmovsxbd(dst, dst); break;
1942       case T_FLOAT:  pmovsxbd(dst, dst); break;
1943       case T_LONG:   pmovsxbq(dst, dst); break;
1944       case T_DOUBLE: pmovsxbq(dst, dst); break;
1945 
1946       default: assert(false, "%s", type2name(elem_bt));
1947     }
1948   } else {
1949     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1950     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1951 
1952     vpxor (dst, dst, dst, vlen_enc);
1953     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1954 
1955     switch (elem_bt) {
1956       case T_BYTE:   /* nothing to do */            break;
1957       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1958       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1959       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1960       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1961       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1962 
1963       default: assert(false, "%s", type2name(elem_bt));
1964     }
1965   }
1966 }
1967 
1968 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1969   if (novlbwdq) {
1970     vpmovsxbd(xtmp, src, vlen_enc);
1971     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1972             Assembler::eq, true, vlen_enc, noreg);
1973   } else {
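         // Same 0 - x trick: mask bytes become 0x00/0xFF and evpmovb2m gathers their sign bits
         // into the destination mask register.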
1974     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1975     vpsubb(xtmp, xtmp, src, vlen_enc);
1976     evpmovb2m(dst, xtmp, vlen_enc);
1977   }
1978 }
1979 
1980 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1981   switch (vlen_in_bytes) {
1982     case 4:  movdl(dst, src);   break;
1983     case 8:  movq(dst, src);    break;
1984     case 16: movdqu(dst, src);  break;
1985     case 32: vmovdqu(dst, src); break;
1986     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1987     default: ShouldNotReachHere();
1988   }
1989 }
1990 
1991 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1992   assert(rscratch != noreg || always_reachable(src), "missing");
1993 
1994   if (reachable(src)) {
1995     load_vector(dst, as_Address(src), vlen_in_bytes);
1996   } else {
1997     lea(rscratch, src);
1998     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1999   }
2000 }
2001 
2002 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
2003   int vlen_enc = vector_length_encoding(vlen);
2004   if (VM_Version::supports_avx()) {
2005     if (bt == T_LONG) {
2006       if (VM_Version::supports_avx2()) {
2007         vpbroadcastq(dst, src, vlen_enc);
2008       } else {
2009         vmovddup(dst, src, vlen_enc);
2010       }
2011     } else if (bt == T_DOUBLE) {
2012       if (vlen_enc != Assembler::AVX_128bit) {
2013         vbroadcastsd(dst, src, vlen_enc, noreg);
2014       } else {
2015         vmovddup(dst, src, vlen_enc);
2016       }
2017     } else {
2018       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
2019         vpbroadcastd(dst, src, vlen_enc);
2020       } else {
2021         vbroadcastss(dst, src, vlen_enc);
2022       }
2023     }
2024   } else if (VM_Version::supports_sse3()) {
2025     movddup(dst, src);
2026   } else {
2027     movq(dst, src);
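         // Pre-SSE3 fallback: for a full 16-byte vector, duplicate the 64 bits just loaded into the upper half.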
2028     if (vlen == 16) {
2029       punpcklqdq(dst, dst);
2030     }
2031   }
2032 }
2033 
2034 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
2035   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
2036   int offset = exact_log2(type2aelembytes(bt)) << 6;
2037   if (is_floating_point_type(bt)) {
2038     offset += 128;
2039   }
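       // Example: T_FLOAT has 4-byte elements, so offset = (exact_log2(4) << 6) + 128 = 256 into the iota table.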
2040   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
2041   load_vector(dst, addr, vlen_in_bytes);
2042 }
2043 
2044 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
2045 
2046 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
2047   int vector_len = Assembler::AVX_128bit;
2048 
2049   switch (opcode) {
2050     case Op_AndReductionV:  pand(dst, src); break;
2051     case Op_OrReductionV:   por (dst, src); break;
2052     case Op_XorReductionV:  pxor(dst, src); break;
2053     case Op_MinReductionV:
2054       switch (typ) {
2055         case T_BYTE:        pminsb(dst, src); break;
2056         case T_SHORT:       pminsw(dst, src); break;
2057         case T_INT:         pminsd(dst, src); break;
2058         case T_LONG:        assert(UseAVX > 2, "required");
2059                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
2060         default:            assert(false, "wrong type");
2061       }
2062       break;
2063     case Op_MaxReductionV:
2064       switch (typ) {
2065         case T_BYTE:        pmaxsb(dst, src); break;
2066         case T_SHORT:       pmaxsw(dst, src); break;
2067         case T_INT:         pmaxsd(dst, src); break;
2068         case T_LONG:        assert(UseAVX > 2, "required");
2069                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
2070         default:            assert(false, "wrong type");
2071       }
2072       break;
2073     case Op_AddReductionVF: addss(dst, src); break;
2074     case Op_AddReductionVD: addsd(dst, src); break;
2075     case Op_AddReductionVI:
2076       switch (typ) {
2077         case T_BYTE:        paddb(dst, src); break;
2078         case T_SHORT:       paddw(dst, src); break;
2079         case T_INT:         paddd(dst, src); break;
2080         default:            assert(false, "wrong type");
2081       }
2082       break;
2083     case Op_AddReductionVL: paddq(dst, src); break;
2084     case Op_MulReductionVF: mulss(dst, src); break;
2085     case Op_MulReductionVD: mulsd(dst, src); break;
2086     case Op_MulReductionVI:
2087       switch (typ) {
2088         case T_SHORT:       pmullw(dst, src); break;
2089         case T_INT:         pmulld(dst, src); break;
2090         default:            assert(false, "wrong type");
2091       }
2092       break;
2093     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2094                             evpmullq(dst, dst, src, vector_len); break;
2095     default:                assert(false, "wrong opcode");
2096   }
2097 }
2098 
2099 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2100   int vector_len = Assembler::AVX_256bit;
2101 
2102   switch (opcode) {
2103     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2104     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2105     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2106     case Op_MinReductionV:
2107       switch (typ) {
2108         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2109         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2110         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2111         case T_LONG:        assert(UseAVX > 2, "required");
2112                             vpminsq(dst, src1, src2, vector_len); break;
2113         default:            assert(false, "wrong type");
2114       }
2115       break;
2116     case Op_MaxReductionV:
2117       switch (typ) {
2118         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2119         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2120         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2121         case T_LONG:        assert(UseAVX > 2, "required");
2122                             vpmaxsq(dst, src1, src2, vector_len); break;
2123         default:            assert(false, "wrong type");
2124       }
2125       break;
2126     case Op_AddReductionVI:
2127       switch (typ) {
2128         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2129         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2130         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2131         default:            assert(false, "wrong type");
2132       }
2133       break;
2134     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2135     case Op_MulReductionVI:
2136       switch (typ) {
2137         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2138         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2139         default:            assert(false, "wrong type");
2140       }
2141       break;
2142     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2143     default:                assert(false, "wrong opcode");
2144   }
2145 }
2146 
2147 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2148                                   XMMRegister dst, XMMRegister src,
2149                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2150   switch (opcode) {
2151     case Op_AddReductionVF:
2152     case Op_MulReductionVF:
2153       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2154       break;
2155 
2156     case Op_AddReductionVD:
2157     case Op_MulReductionVD:
2158       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2159       break;
2160 
2161     default: assert(false, "wrong opcode");
2162   }
2163 }
2164 
2165 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2166                              Register dst, Register src1, XMMRegister src2,
2167                              XMMRegister vtmp1, XMMRegister vtmp2) {
2168   switch (vlen) {
2169     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2170     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2171     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2172     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2173 
2174     default: assert(false, "wrong vector length");
2175   }
2176 }
2177 
2178 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2179                              Register dst, Register src1, XMMRegister src2,
2180                              XMMRegister vtmp1, XMMRegister vtmp2) {
2181   switch (vlen) {
2182     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2183     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2184     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2185     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2186 
2187     default: assert(false, "wrong vector length");
2188   }
2189 }
2190 
2191 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2192                              Register dst, Register src1, XMMRegister src2,
2193                              XMMRegister vtmp1, XMMRegister vtmp2) {
2194   switch (vlen) {
2195     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2196     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2197     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2198     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2199 
2200     default: assert(false, "wrong vector length");
2201   }
2202 }
2203 
2204 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2205                              Register dst, Register src1, XMMRegister src2,
2206                              XMMRegister vtmp1, XMMRegister vtmp2) {
2207   switch (vlen) {
2208     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2209     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2210     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2211     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2212 
2213     default: assert(false, "wrong vector length");
2214   }
2215 }
2216 
2217 #ifdef _LP64
2218 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2219                              Register dst, Register src1, XMMRegister src2,
2220                              XMMRegister vtmp1, XMMRegister vtmp2) {
2221   switch (vlen) {
2222     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2223     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2224     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2225 
2226     default: assert(false, "wrong vector length");
2227   }
2228 }
2229 #endif // _LP64
2230 
2231 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2232   switch (vlen) {
2233     case 2:
2234       assert(vtmp2 == xnoreg, "");
2235       reduce2F(opcode, dst, src, vtmp1);
2236       break;
2237     case 4:
2238       assert(vtmp2 == xnoreg, "");
2239       reduce4F(opcode, dst, src, vtmp1);
2240       break;
2241     case 8:
2242       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2243       break;
2244     case 16:
2245       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2246       break;
2247     default: assert(false, "wrong vector length");
2248   }
2249 }
2250 
2251 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2252   switch (vlen) {
2253     case 2:
2254       assert(vtmp2 == xnoreg, "");
2255       reduce2D(opcode, dst, src, vtmp1);
2256       break;
2257     case 4:
2258       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2259       break;
2260     case 8:
2261       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2262       break;
2263     default: assert(false, "wrong vector length");
2264   }
2265 }
2266 
2267 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2268   if (opcode == Op_AddReductionVI) {
2269     if (vtmp1 != src2) {
2270       movdqu(vtmp1, src2);
2271     }
2272     phaddd(vtmp1, vtmp1);
2273   } else {
2274     pshufd(vtmp1, src2, 0x1);
2275     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2276   }
2277   movdl(vtmp2, src1);
2278   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2279   movdl(dst, vtmp1);
2280 }
2281 
2282 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2283   if (opcode == Op_AddReductionVI) {
2284     if (vtmp1 != src2) {
2285       movdqu(vtmp1, src2);
2286     }
2287     phaddd(vtmp1, src2);
2288     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2289   } else {
2290     pshufd(vtmp2, src2, 0xE);
2291     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2292     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2293   }
2294 }
2295 
2296 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2297   if (opcode == Op_AddReductionVI) {
2298     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2299     vextracti128_high(vtmp2, vtmp1);
2300     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2301     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2302   } else {
2303     vextracti128_high(vtmp1, src2);
2304     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2305     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2306   }
2307 }
2308 
2309 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2310   vextracti64x4_high(vtmp2, src2);
2311   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2312   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2313 }
2314 
2315 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2316   pshufd(vtmp2, src2, 0x1);
2317   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2318   movdqu(vtmp1, vtmp2);
2319   psrldq(vtmp1, 2);
2320   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2321   movdqu(vtmp2, vtmp1);
2322   psrldq(vtmp2, 1);
2323   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2324   movdl(vtmp2, src1);
2325   pmovsxbd(vtmp1, vtmp1);
2326   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2327   pextrb(dst, vtmp1, 0x0);
2328   movsbl(dst, dst);
2329 }
2330 
2331 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2332   pshufd(vtmp1, src2, 0xE);
2333   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2334   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2335 }
2336 
2337 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2338   vextracti128_high(vtmp2, src2);
2339   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2340   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2341 }
2342 
2343 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2344   vextracti64x4_high(vtmp1, src2);
2345   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2346   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2347 }
2348 
2349 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2350   pmovsxbw(vtmp2, src2);
2351   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2352 }
2353 
2354 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2355   if (UseAVX > 1) {
2356     int vector_len = Assembler::AVX_256bit;
2357     vpmovsxbw(vtmp1, src2, vector_len);
2358     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2359   } else {
2360     pmovsxbw(vtmp2, src2);
2361     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2362     pshufd(vtmp2, src2, 0x1);
2363     pmovsxbw(vtmp2, src2);
2364     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2365   }
2366 }
2367 
2368 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2369   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2370     int vector_len = Assembler::AVX_512bit;
2371     vpmovsxbw(vtmp1, src2, vector_len);
2372     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2373   } else {
2374     assert(UseAVX >= 2,"Should not reach here.");
2375     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2376     vextracti128_high(vtmp2, src2);
2377     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2378   }
2379 }
2380 
2381 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2382   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2383   vextracti64x4_high(vtmp2, src2);
2384   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2385 }
2386 
2387 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2388   if (opcode == Op_AddReductionVI) {
2389     if (vtmp1 != src2) {
2390       movdqu(vtmp1, src2);
2391     }
2392     phaddw(vtmp1, vtmp1);
2393     phaddw(vtmp1, vtmp1);
2394   } else {
2395     pshufd(vtmp2, src2, 0x1);
2396     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2397     movdqu(vtmp1, vtmp2);
2398     psrldq(vtmp1, 2);
2399     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2400   }
2401   movdl(vtmp2, src1);
2402   pmovsxwd(vtmp1, vtmp1);
2403   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2404   pextrw(dst, vtmp1, 0x0);
2405   movswl(dst, dst);
2406 }
2407 
2408 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2409   if (opcode == Op_AddReductionVI) {
2410     if (vtmp1 != src2) {
2411       movdqu(vtmp1, src2);
2412     }
2413     phaddw(vtmp1, src2);
2414   } else {
2415     pshufd(vtmp1, src2, 0xE);
2416     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2417   }
2418   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2419 }
2420 
2421 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2422   if (opcode == Op_AddReductionVI) {
2423     int vector_len = Assembler::AVX_256bit;
2424     vphaddw(vtmp2, src2, src2, vector_len);
2425     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2426   } else {
2427     vextracti128_high(vtmp2, src2);
2428     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2429   }
2430   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2431 }
2432 
2433 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2434   int vector_len = Assembler::AVX_256bit;
2435   vextracti64x4_high(vtmp1, src2);
2436   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2437   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2438 }
2439 
2440 #ifdef _LP64
2441 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2442   pshufd(vtmp2, src2, 0xE);
2443   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2444   movdq(vtmp1, src1);
2445   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2446   movdq(dst, vtmp1);
2447 }
2448 
2449 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2450   vextracti128_high(vtmp1, src2);
2451   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2452   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2453 }
2454 
2455 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2456   vextracti64x4_high(vtmp2, src2);
2457   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2458   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2459 }
2460 
2461 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
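       // Build a mask with the low 'len' bits set: start from all ones and clear bit 'len' and above with BZHI.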
2462   mov64(temp, -1L);
2463   bzhiq(temp, temp, len);
2464   kmovql(dst, temp);
2465 }
2466 #endif // _LP64
2467 
2468 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2469   reduce_operation_128(T_FLOAT, opcode, dst, src);
2470   pshufd(vtmp, src, 0x1);
2471   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2472 }
2473 
2474 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2475   reduce2F(opcode, dst, src, vtmp);
2476   pshufd(vtmp, src, 0x2);
2477   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2478   pshufd(vtmp, src, 0x3);
2479   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2480 }
2481 
2482 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2483   reduce4F(opcode, dst, src, vtmp2);
2484   vextractf128_high(vtmp2, src);
2485   reduce4F(opcode, dst, vtmp2, vtmp1);
2486 }
2487 
2488 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2489   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2490   vextracti64x4_high(vtmp1, src);
2491   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2492 }
2493 
2494 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2495   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2496   pshufd(vtmp, src, 0xE);
2497   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2498 }
2499 
2500 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2501   reduce2D(opcode, dst, src, vtmp2);
2502   vextractf128_high(vtmp2, src);
2503   reduce2D(opcode, dst, vtmp2, vtmp1);
2504 }
2505 
2506 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2507   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2508   vextracti64x4_high(vtmp1, src);
2509   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2510 }
2511 
2512 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2513   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2514 }
2515 
2516 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2517   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2518 }
2519 
2520 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2521                                  int vec_enc) {
2522   switch(elem_bt) {
2523     case T_INT:
2524     case T_FLOAT:
2525       vmaskmovps(dst, src, mask, vec_enc);
2526       break;
2527     case T_LONG:
2528     case T_DOUBLE:
2529       vmaskmovpd(dst, src, mask, vec_enc);
2530       break;
2531     default:
2532       fatal("Unsupported type %s", type2name(elem_bt));
2533       break;
2534   }
2535 }
2536 
2537 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2538                                  int vec_enc) {
2539   switch(elem_bt) {
2540     case T_INT:
2541     case T_FLOAT:
2542       vmaskmovps(dst, src, mask, vec_enc);
2543       break;
2544     case T_LONG:
2545     case T_DOUBLE:
2546       vmaskmovpd(dst, src, mask, vec_enc);
2547       break;
2548     default:
2549       fatal("Unsupported type %s", type2name(elem_bt));
2550       break;
2551   }
2552 }
2553 
2554 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2555                                           XMMRegister dst, XMMRegister src,
2556                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2557                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2558   const int permconst[] = {1, 14};
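       // vpermilps immediates for the in-lane reduction steps: 0x1 swaps elements 0 and 1,
       // 0xE (14) moves elements 2 and 3 down into positions 0 and 1.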
2559   XMMRegister wsrc = src;
2560   XMMRegister wdst = xmm_0;
2561   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2562 
2563   int vlen_enc = Assembler::AVX_128bit;
2564   if (vlen == 16) {
2565     vlen_enc = Assembler::AVX_256bit;
2566   }
2567 
2568   for (int i = log2(vlen) - 1; i >=0; i--) {
2569     if (i == 0 && !is_dst_valid) {
2570       wdst = dst;
2571     }
2572     if (i == 3) {
2573       vextracti64x4_high(wtmp, wsrc);
2574     } else if (i == 2) {
2575       vextracti128_high(wtmp, wsrc);
2576     } else { // i = [0,1]
2577       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2578     }
2579     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2580     wsrc = wdst;
2581     vlen_enc = Assembler::AVX_128bit;
2582   }
2583   if (is_dst_valid) {
2584     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2585   }
2586 }
2587 
2588 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2589                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2590                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2591   XMMRegister wsrc = src;
2592   XMMRegister wdst = xmm_0;
2593   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2594   int vlen_enc = Assembler::AVX_128bit;
2595   if (vlen == 8) {
2596     vlen_enc = Assembler::AVX_256bit;
2597   }
2598   for (int i = log2(vlen) - 1; i >=0; i--) {
2599     if (i == 0 && !is_dst_valid) {
2600       wdst = dst;
2601     }
2602     if (i == 1) {
2603       vextracti128_high(wtmp, wsrc);
2604     } else if (i == 2) {
2605       vextracti64x4_high(wtmp, wsrc);
2606     } else {
2607       assert(i == 0, "%d", i);
2608       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2609     }
2610     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2611     wsrc = wdst;
2612     vlen_enc = Assembler::AVX_128bit;
2613   }
2614   if (is_dst_valid) {
2615     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2616   }
2617 }
2618 
2619 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2620   switch (bt) {
2621     case T_BYTE:  pextrb(dst, src, idx); break;
2622     case T_SHORT: pextrw(dst, src, idx); break;
2623     case T_INT:   pextrd(dst, src, idx); break;
2624     case T_LONG:  pextrq(dst, src, idx); break;
2625 
2626     default:
2627       assert(false,"Should not reach here.");
2628       break;
2629   }
2630 }
2631 
2632 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2633   int esize =  type2aelembytes(typ);
2634   int elem_per_lane = 16/esize;
2635   int lane = elemindex / elem_per_lane;
2636   int eindex = elemindex % elem_per_lane;
2637 
2638   if (lane >= 2) {
2639     assert(UseAVX > 2, "required");
2640     vextractf32x4(dst, src, lane & 3);
2641     return dst;
2642   } else if (lane > 0) {
2643     assert(UseAVX > 0, "required");
2644     vextractf128(dst, src, lane);
2645     return dst;
2646   } else {
2647     return src;
2648   }
2649 }
2650 
2651 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2652   if (typ == T_BYTE) {
2653     movsbl(dst, dst);
2654   } else if (typ == T_SHORT) {
2655     movswl(dst, dst);
2656   }
2657 }
2658 
2659 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2660   int esize =  type2aelembytes(typ);
2661   int elem_per_lane = 16/esize;
2662   int eindex = elemindex % elem_per_lane;
2663   assert(is_integral_type(typ),"required");
2664 
2665   if (eindex == 0) {
2666     if (typ == T_LONG) {
2667       movq(dst, src);
2668     } else {
2669       movdl(dst, src);
2670       movsxl(typ, dst);
2671     }
2672   } else {
2673     extract(typ, dst, src, eindex);
2674     movsxl(typ, dst);
2675   }
2676 }
2677 
2678 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2679   int esize =  type2aelembytes(typ);
2680   int elem_per_lane = 16/esize;
2681   int eindex = elemindex % elem_per_lane;
2682   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2683 
2684   if (eindex == 0) {
2685     movq(dst, src);
2686   } else {
2687     if (typ == T_FLOAT) {
2688       if (UseAVX == 0) {
2689         movdqu(dst, src);
2690         shufps(dst, dst, eindex);
2691       } else {
2692         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2693       }
2694     } else {
2695       if (UseAVX == 0) {
2696         movdqu(dst, src);
2697         psrldq(dst, eindex*esize);
2698       } else {
2699         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2700       }
2701       movq(dst, dst);
2702     }
2703   }
2704   // Zero upper bits
2705   if (typ == T_FLOAT) {
2706     if (UseAVX == 0) {
2707       assert(vtmp != xnoreg, "required.");
2708       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2709       pand(dst, vtmp);
2710     } else {
2711       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2712     }
2713   }
2714 }
2715 
2716 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2717   switch(typ) {
2718     case T_BYTE:
2719     case T_BOOLEAN:
2720       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2721       break;
2722     case T_SHORT:
2723     case T_CHAR:
2724       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2725       break;
2726     case T_INT:
2727     case T_FLOAT:
2728       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2729       break;
2730     case T_LONG:
2731     case T_DOUBLE:
2732       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2733       break;
2734     default:
2735       assert(false,"Should not reach here.");
2736       break;
2737   }
2738 }
2739 
2740 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2741   assert(rscratch != noreg || always_reachable(src2), "missing");
2742 
2743   switch(typ) {
2744     case T_BOOLEAN:
2745     case T_BYTE:
2746       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2747       break;
2748     case T_CHAR:
2749     case T_SHORT:
2750       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2751       break;
2752     case T_INT:
2753     case T_FLOAT:
2754       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2755       break;
2756     case T_LONG:
2757     case T_DOUBLE:
2758       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2759       break;
2760     default:
2761       assert(false,"Should not reach here.");
2762       break;
2763   }
2764 }
2765 
2766 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2767   switch(typ) {
2768     case T_BYTE:
2769       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2770       break;
2771     case T_SHORT:
2772       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2773       break;
2774     case T_INT:
2775     case T_FLOAT:
2776       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2777       break;
2778     case T_LONG:
2779     case T_DOUBLE:
2780       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2781       break;
2782     default:
2783       assert(false,"Should not reach here.");
2784       break;
2785   }
2786 }
2787 
2788 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2789   assert(vlen_in_bytes <= 32, "");
2790   int esize = type2aelembytes(bt);
2791   if (vlen_in_bytes == 32) {
2792     assert(vtmp == xnoreg, "required.");
2793     if (esize >= 4) {
2794       vtestps(src1, src2, AVX_256bit);
2795     } else {
2796       vptest(src1, src2, AVX_256bit);
2797     }
2798     return;
2799   }
2800   if (vlen_in_bytes < 16) {
2801     // Duplicate the lower part to fill the whole register;
2802     // there is no need to do so for src2
2803     assert(vtmp != xnoreg, "required");
2804     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2805     pshufd(vtmp, src1, shuffle_imm);
2806   } else {
2807     assert(vtmp == xnoreg, "required");
2808     vtmp = src1;
2809   }
2810   if (esize >= 4 && VM_Version::supports_avx()) {
2811     vtestps(vtmp, src2, AVX_128bit);
2812   } else {
2813     ptest(vtmp, src2);
2814   }
2815 }
2816 
2817 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2818   assert(UseAVX >= 2, "required");
2819 #ifdef ASSERT
2820   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2821   bool is_bw_supported = VM_Version::supports_avx512bw();
2822   if (is_bw && !is_bw_supported) {
2823     assert(vlen_enc != Assembler::AVX_512bit, "required");
2824     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2825            "XMM register should be 0-15");
2826   }
2827 #endif // ASSERT
2828   switch (elem_bt) {
2829     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2830     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2831     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2832     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2833     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2834     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2835     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2836   }
2837 }
2838 
2839 #ifdef _LP64
2840 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2841   assert(UseAVX >= 2, "required");
2842   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2843   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2844   if ((UseAVX > 2) &&
2845       (!is_bw || VM_Version::supports_avx512bw()) &&
2846       (!is_vl || VM_Version::supports_avx512vl())) {
2847     switch (elem_bt) {
2848       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2849       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2850       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2851       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2852       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2853     }
2854   } else {
2855     assert(vlen_enc != Assembler::AVX_512bit, "required");
2856     assert((dst->encoding() < 16),"XMM register should be 0-15");
2857     switch (elem_bt) {
2858       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2859       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2860       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2861       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2862       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2863       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2864       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2865     }
2866   }
2867 }
2868 #endif
2869 
2870 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2871   switch (to_elem_bt) {
2872     case T_SHORT:
2873       vpmovsxbw(dst, src, vlen_enc);
2874       break;
2875     case T_INT:
2876       vpmovsxbd(dst, src, vlen_enc);
2877       break;
2878     case T_FLOAT:
2879       vpmovsxbd(dst, src, vlen_enc);
2880       vcvtdq2ps(dst, dst, vlen_enc);
2881       break;
2882     case T_LONG:
2883       vpmovsxbq(dst, src, vlen_enc);
2884       break;
2885     case T_DOUBLE: {
2886       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2887       vpmovsxbd(dst, src, mid_vlen_enc);
2888       vcvtdq2pd(dst, dst, vlen_enc);
2889       break;
2890     }
2891     default:
2892       fatal("Unsupported type %s", type2name(to_elem_bt));
2893       break;
2894   }
2895 }
2896 
2897 //-------------------------------------------------------------------------------------------
2898 
2899 // IndexOf for constant substrings with size >= 8 chars
2900 // which don't need to be loaded through the stack.
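// As a rough sketch of the semantics (not the emitted code): this implements a
// plain indexOf over the element arrays, with the substring length int_cnt2
// known at compile time and at least one full vector (8 chars / 16 bytes) long:
//
//   for (int i = 0; i + int_cnt2 <= cnt1; i++) {
//     if (str1[i .. i+int_cnt2) matches str2[0 .. int_cnt2)) return i;
//   }
//   return -1;
//
// pcmpestri is used both to scan for a candidate start position and to verify
// the remaining substring elements from that position.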
2901 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2902                                          Register cnt1, Register cnt2,
2903                                          int int_cnt2,  Register result,
2904                                          XMMRegister vec, Register tmp,
2905                                          int ae) {
2906   ShortBranchVerifier sbv(this);
2907   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2908   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2909 
2910   // This method uses the pcmpestri instruction with bound registers
2911   //   inputs:
2912   //     xmm - substring
2913   //     rax - substring length (elements count)
2914   //     mem - scanned string
2915   //     rdx - string length (elements count)
2916   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2917   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2918   //   outputs:
2919   //     rcx - matched index in string
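  // (In the imm8 mode byte, bits [1:0] select the element format (00 = unsigned
  // bytes, 01 = unsigned words) and bits [3:2] = 11 select the "equal ordered"
  // aggregation, i.e. substring search; hence 0x0c and 0x0d.)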
2920   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2921   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2922   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2923   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2924   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2925 
2926   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2927         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2928         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2929 
2930   // Note, inline_string_indexOf() generates checks:
2931   // if (substr.count > string.count) return -1;
2932   // if (substr.count == 0) return 0;
2933   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2934 
2935   // Load substring.
2936   if (ae == StrIntrinsicNode::UL) {
2937     pmovzxbw(vec, Address(str2, 0));
2938   } else {
2939     movdqu(vec, Address(str2, 0));
2940   }
2941   movl(cnt2, int_cnt2);
2942   movptr(result, str1); // string addr
2943 
2944   if (int_cnt2 > stride) {
2945     jmpb(SCAN_TO_SUBSTR);
2946 
2947     // Reload substr for rescan; this code
2948     // is executed only for large substrings (> 8 chars)
2949     bind(RELOAD_SUBSTR);
2950     if (ae == StrIntrinsicNode::UL) {
2951       pmovzxbw(vec, Address(str2, 0));
2952     } else {
2953       movdqu(vec, Address(str2, 0));
2954     }
2955     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2956 
2957     bind(RELOAD_STR);
2958     // We came here after the beginning of the substring was
2959     // matched but the rest of it was not, so we need to search
2960     // again. Start from the next element after the previous match.
2961 
2962     // cnt2 is the number of remaining substring elements and
2963     // cnt1 is the number of remaining string elements when the compare failed.
2964     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2965     subl(cnt1, cnt2);
2966     addl(cnt1, int_cnt2);
2967     movl(cnt2, int_cnt2); // Now restore cnt2
2968 
2969     decrementl(cnt1);     // Shift to next element
2970     cmpl(cnt1, cnt2);
2971     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2972 
2973     addptr(result, (1<<scale1));
2974 
2975   } // (int_cnt2 > 8)
2976 
2977   // Scan string for start of substr in 16-byte vectors
2978   bind(SCAN_TO_SUBSTR);
2979   pcmpestri(vec, Address(result, 0), mode);
2980   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2981   subl(cnt1, stride);
2982   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2983   cmpl(cnt1, cnt2);
2984   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2985   addptr(result, 16);
2986   jmpb(SCAN_TO_SUBSTR);
2987 
2988   // Found a potential substr
2989   bind(FOUND_CANDIDATE);
2990   // Matched whole vector if first element matched (tmp(rcx) == 0).
2991   if (int_cnt2 == stride) {
2992     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2993   } else { // int_cnt2 > 8
2994     jccb(Assembler::overflow, FOUND_SUBSTR);
2995   }
2996   // After pcmpestri tmp(rcx) contains matched element index
2997   // Compute start addr of substr
2998   lea(result, Address(result, tmp, scale1));
2999 
3000   // Make sure string is still long enough
3001   subl(cnt1, tmp);
3002   cmpl(cnt1, cnt2);
3003   if (int_cnt2 == stride) {
3004     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3005   } else { // int_cnt2 > 8
3006     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
3007   }
3008   // Left less than substring.
3009 
3010   bind(RET_NOT_FOUND);
3011   movl(result, -1);
3012   jmp(EXIT);
3013 
3014   if (int_cnt2 > stride) {
3015     // This code is optimized for the case when whole substring
3016     // is matched if its head is matched.
3017     bind(MATCH_SUBSTR_HEAD);
3018     pcmpestri(vec, Address(result, 0), mode);
3019     // Reload only the string if it does not match
3020     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
3021 
3022     Label CONT_SCAN_SUBSTR;
3023     // Compare the rest of substring (> 8 chars).
3024     bind(FOUND_SUBSTR);
3025     // First 8 chars are already matched.
3026     negptr(cnt2);
3027     addptr(cnt2, stride);
3028 
3029     bind(SCAN_SUBSTR);
3030     subl(cnt1, stride);
3031     cmpl(cnt2, -stride); // Do not read beyond substring
3032     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
3033     // Back up the string pointers to avoid reading beyond the substring:
3034     // cnt1 = cnt1 - cnt2 + 8
3035     addl(cnt1, cnt2); // cnt2 is negative
3036     addl(cnt1, stride);
3037     movl(cnt2, stride); negptr(cnt2);
3038     bind(CONT_SCAN_SUBSTR);
3039     if (int_cnt2 < (int)G) {
3040       int tail_off1 = int_cnt2<<scale1;
3041       int tail_off2 = int_cnt2<<scale2;
3042       if (ae == StrIntrinsicNode::UL) {
3043         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
3044       } else {
3045         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3046       }
3047       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3048     } else {
3049       // calculate index in register to avoid integer overflow (int_cnt2*2)
3050       movl(tmp, int_cnt2);
3051       addptr(tmp, cnt2);
3052       if (ae == StrIntrinsicNode::UL) {
3053         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3054       } else {
3055         movdqu(vec, Address(str2, tmp, scale2, 0));
3056       }
3057       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3058     }
3059     // Need to reload string pointers if we did not match the whole vector
3060     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3061     addptr(cnt2, stride);
3062     jcc(Assembler::negative, SCAN_SUBSTR);
3063     // Fall through if found full substring
3064 
3065   } // (int_cnt2 > 8)
3066 
3067   bind(RET_FOUND);
3068   // Found result if we matched full small substring.
3069   // Compute substr offset
3070   subptr(result, str1);
3071   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3072     shrl(result, 1); // index
3073   }
3074   bind(EXIT);
3075 
3076 } // string_indexofC8
3077 
3078 // Small strings are loaded through the stack if they cross a page boundary.
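// A sketch of the page-crossing test used below (assuming the usual page-aligned
// heap mapping): a 16-byte load starting at addr stays within a single page iff
//
//   (addr & (os::vm_page_size() - 1)) <= os::vm_page_size() - 16
//
// Short strings that fail this test are first copied onto the stack so that a
// full 16-byte vector load cannot fault.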
3079 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3080                                        Register cnt1, Register cnt2,
3081                                        int int_cnt2,  Register result,
3082                                        XMMRegister vec, Register tmp,
3083                                        int ae) {
3084   ShortBranchVerifier sbv(this);
3085   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3086   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3087 
3088   //
3089   // int_cnt2 is the length of a small (< 8 chars) constant substring,
3090   // or (-1) for a non-constant substring, in which case its length
3091   // is in the cnt2 register.
3092   //
3093   // Note, inline_string_indexOf() generates checks:
3094   // if (substr.count > string.count) return -1;
3095   // if (substr.count == 0) return 0;
3096   //
3097   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3098   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3099   // This method uses the pcmpestri instruction with bound registers
3100   //   inputs:
3101   //     xmm - substring
3102   //     rax - substring length (elements count)
3103   //     mem - scanned string
3104   //     rdx - string length (elements count)
3105   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3106   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3107   //   outputs:
3108   //     rcx - matched index in string
3109   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3110   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3111   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3112   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3113 
3114   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3115         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3116         FOUND_CANDIDATE;
3117 
3118   { //========================================================
3119     // We don't know where these strings are located
3120     // and we can't read beyond them. Load them through stack.
3121     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3122 
3123     movptr(tmp, rsp); // save old SP
3124 
3125     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3126       if (int_cnt2 == (1>>scale2)) { // One byte
3127         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3128         load_unsigned_byte(result, Address(str2, 0));
3129         movdl(vec, result); // move 32 bits
3130       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3131         // Not enough header space in 32-bit VM: 12+3 = 15.
3132         movl(result, Address(str2, -1));
3133         shrl(result, 8);
3134         movdl(vec, result); // move 32 bits
3135       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3136         load_unsigned_short(result, Address(str2, 0));
3137         movdl(vec, result); // move 32 bits
3138       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3139         movdl(vec, Address(str2, 0)); // move 32 bits
3140       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3141         movq(vec, Address(str2, 0));  // move 64 bits
3142       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3143         // Array header size is 12 bytes in 32-bit VM
3144         // + 6 bytes for 3 chars == 18 bytes,
3145         // enough space to load vec and shift.
3146         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3147         if (ae == StrIntrinsicNode::UL) {
3148           int tail_off = int_cnt2-8;
3149           pmovzxbw(vec, Address(str2, tail_off));
3150           psrldq(vec, -2*tail_off);
3151         }
3152         else {
3153           int tail_off = int_cnt2*(1<<scale2);
3154           movdqu(vec, Address(str2, tail_off-16));
3155           psrldq(vec, 16-tail_off);
3156         }
3157       }
3158     } else { // not constant substring
3159       cmpl(cnt2, stride);
3160       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3161 
3162       // We can read beyond the string if str2+16 does not cross a page boundary
3163       // since heaps are aligned and mapped by pages.
3164       assert(os::vm_page_size() < (int)G, "default page should be small");
3165       movl(result, str2); // We need only low 32 bits
3166       andl(result, ((int)os::vm_page_size()-1));
3167       cmpl(result, ((int)os::vm_page_size()-16));
3168       jccb(Assembler::belowEqual, CHECK_STR);
3169 
3170       // Move small strings to the stack to allow loading 16 bytes into vec.
3171       subptr(rsp, 16);
3172       int stk_offset = wordSize-(1<<scale2);
3173       push(cnt2);
3174 
3175       bind(COPY_SUBSTR);
3176       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3177         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3178         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3179       } else if (ae == StrIntrinsicNode::UU) {
3180         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3181         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3182       }
3183       decrement(cnt2);
3184       jccb(Assembler::notZero, COPY_SUBSTR);
3185 
3186       pop(cnt2);
3187       movptr(str2, rsp);  // New substring address
3188     } // non constant
3189 
3190     bind(CHECK_STR);
3191     cmpl(cnt1, stride);
3192     jccb(Assembler::aboveEqual, BIG_STRINGS);
3193 
3194     // Check cross page boundary.
3195     movl(result, str1); // We need only low 32 bits
3196     andl(result, ((int)os::vm_page_size()-1));
3197     cmpl(result, ((int)os::vm_page_size()-16));
3198     jccb(Assembler::belowEqual, BIG_STRINGS);
3199 
3200     subptr(rsp, 16);
3201     int stk_offset = -(1<<scale1);
3202     if (int_cnt2 < 0) { // not constant
3203       push(cnt2);
3204       stk_offset += wordSize;
3205     }
3206     movl(cnt2, cnt1);
3207 
3208     bind(COPY_STR);
3209     if (ae == StrIntrinsicNode::LL) {
3210       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3211       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3212     } else {
3213       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3214       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3215     }
3216     decrement(cnt2);
3217     jccb(Assembler::notZero, COPY_STR);
3218 
3219     if (int_cnt2 < 0) { // not constant
3220       pop(cnt2);
3221     }
3222     movptr(str1, rsp);  // New string address
3223 
3224     bind(BIG_STRINGS);
3225     // Load substring.
3226     if (int_cnt2 < 0) { // -1
3227       if (ae == StrIntrinsicNode::UL) {
3228         pmovzxbw(vec, Address(str2, 0));
3229       } else {
3230         movdqu(vec, Address(str2, 0));
3231       }
3232       push(cnt2);       // substr count
3233       push(str2);       // substr addr
3234       push(str1);       // string addr
3235     } else {
3236       // Small (< 8 chars) constant substrings are loaded already.
3237       movl(cnt2, int_cnt2);
3238     }
3239     push(tmp);  // original SP
3240 
3241   } // Finished loading
3242 
3243   //========================================================
3244   // Start search
3245   //
3246 
3247   movptr(result, str1); // string addr
3248 
3249   if (int_cnt2  < 0) {  // Only for non constant substring
3250     jmpb(SCAN_TO_SUBSTR);
3251 
3252     // SP saved at sp+0
3253     // String saved at sp+1*wordSize
3254     // Substr saved at sp+2*wordSize
3255     // Substr count saved at sp+3*wordSize
3256 
3257     // Reload substr for rescan; this code
3258     // is executed only for large substrings (> 8 chars)
3259     bind(RELOAD_SUBSTR);
3260     movptr(str2, Address(rsp, 2*wordSize));
3261     movl(cnt2, Address(rsp, 3*wordSize));
3262     if (ae == StrIntrinsicNode::UL) {
3263       pmovzxbw(vec, Address(str2, 0));
3264     } else {
3265       movdqu(vec, Address(str2, 0));
3266     }
3267     // We came here after the beginning of the substring was
3268     // matched but the rest of it was not, so we need to search
3269     // again. Start from the next element after the previous match.
3270     subptr(str1, result); // Restore counter
3271     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3272       shrl(str1, 1);
3273     }
3274     addl(cnt1, str1);
3275     decrementl(cnt1);   // Shift to next element
3276     cmpl(cnt1, cnt2);
3277     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3278 
3279     addptr(result, (1<<scale1));
3280   } // non constant
3281 
3282   // Scan string for start of substr in 16-byte vectors
3283   bind(SCAN_TO_SUBSTR);
3284   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3285   pcmpestri(vec, Address(result, 0), mode);
3286   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3287   subl(cnt1, stride);
3288   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3289   cmpl(cnt1, cnt2);
3290   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3291   addptr(result, 16);
3292 
3293   bind(ADJUST_STR);
3294   cmpl(cnt1, stride); // Do not read beyond string
3295   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3296   // Back up the string pointer to avoid reading beyond the string.
3297   lea(result, Address(result, cnt1, scale1, -16));
3298   movl(cnt1, stride);
3299   jmpb(SCAN_TO_SUBSTR);
3300 
3301   // Found a potential substr
3302   bind(FOUND_CANDIDATE);
3303   // After pcmpestri tmp(rcx) contains matched element index
3304 
3305   // Make sure string is still long enough
3306   subl(cnt1, tmp);
3307   cmpl(cnt1, cnt2);
3308   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3309   // Left less than substring.
3310 
3311   bind(RET_NOT_FOUND);
3312   movl(result, -1);
3313   jmp(CLEANUP);
3314 
3315   bind(FOUND_SUBSTR);
3316   // Compute start addr of substr
3317   lea(result, Address(result, tmp, scale1));
3318   if (int_cnt2 > 0) { // Constant substring
3319     // Repeat search for small substring (< 8 chars)
3320     // from new point without reloading substring.
3321     // Have to check that we don't read beyond string.
3322     cmpl(tmp, stride-int_cnt2);
3323     jccb(Assembler::greater, ADJUST_STR);
3324     // Fall through if matched whole substring.
3325   } else { // non constant
3326     assert(int_cnt2 == -1, "should be != 0");
3327 
3328     addl(tmp, cnt2);
3329     // Found result if we matched whole substring.
3330     cmpl(tmp, stride);
3331     jcc(Assembler::lessEqual, RET_FOUND);
3332 
3333     // Repeat search for small substring (<= 8 chars)
3334     // from new point 'str1' without reloading substring.
3335     cmpl(cnt2, stride);
3336     // Have to check that we don't read beyond string.
3337     jccb(Assembler::lessEqual, ADJUST_STR);
3338 
3339     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3340     // Compare the rest of substring (> 8 chars).
3341     movptr(str1, result);
3342 
3343     cmpl(tmp, cnt2);
3344     // First 8 chars are already matched.
3345     jccb(Assembler::equal, CHECK_NEXT);
3346 
3347     bind(SCAN_SUBSTR);
3348     pcmpestri(vec, Address(str1, 0), mode);
3349     // Need to reload string pointers if we did not match the whole vector
3350     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3351 
3352     bind(CHECK_NEXT);
3353     subl(cnt2, stride);
3354     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3355     addptr(str1, 16);
3356     if (ae == StrIntrinsicNode::UL) {
3357       addptr(str2, 8);
3358     } else {
3359       addptr(str2, 16);
3360     }
3361     subl(cnt1, stride);
3362     cmpl(cnt2, stride); // Do not read beyond substring
3363     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3364     // Back up the string pointers to avoid reading beyond the substring.
3365 
3366     if (ae == StrIntrinsicNode::UL) {
3367       lea(str2, Address(str2, cnt2, scale2, -8));
3368       lea(str1, Address(str1, cnt2, scale1, -16));
3369     } else {
3370       lea(str2, Address(str2, cnt2, scale2, -16));
3371       lea(str1, Address(str1, cnt2, scale1, -16));
3372     }
3373     subl(cnt1, cnt2);
3374     movl(cnt2, stride);
3375     addl(cnt1, stride);
3376     bind(CONT_SCAN_SUBSTR);
3377     if (ae == StrIntrinsicNode::UL) {
3378       pmovzxbw(vec, Address(str2, 0));
3379     } else {
3380       movdqu(vec, Address(str2, 0));
3381     }
3382     jmp(SCAN_SUBSTR);
3383 
3384     bind(RET_FOUND_LONG);
3385     movptr(str1, Address(rsp, wordSize));
3386   } // non constant
3387 
3388   bind(RET_FOUND);
3389   // Compute substr offset
3390   subptr(result, str1);
3391   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3392     shrl(result, 1); // index
3393   }
3394   bind(CLEANUP);
3395   pop(rsp); // restore SP
3396 
3397 } // string_indexof
3398 
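// Scalar equivalent of the UTF-16 char search below (a sketch of the semantics,
// not the emitted code):
//
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;   // str1 holds 16-bit chars
//   }
//   return -1;
//
// The vector paths broadcast ch and compare 16 chars (AVX2) or 8 chars (SSE)
// per iteration with vpcmpeqw/pcmpeqw, using vptest/ptest to detect a match.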
3399 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3400                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3401   ShortBranchVerifier sbv(this);
3402   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3403 
3404   int stride = 8;
3405 
3406   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3407         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3408         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3409         FOUND_SEQ_CHAR, DONE_LABEL;
3410 
3411   movptr(result, str1);
3412   if (UseAVX >= 2) {
3413     cmpl(cnt1, stride);
3414     jcc(Assembler::less, SCAN_TO_CHAR);
3415     cmpl(cnt1, 2*stride);
3416     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3417     movdl(vec1, ch);
3418     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3419     vpxor(vec2, vec2);
3420     movl(tmp, cnt1);
3421     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3422     andl(cnt1,0x0000000F);  //tail count (in chars)
3423 
3424     bind(SCAN_TO_16_CHAR_LOOP);
3425     vmovdqu(vec3, Address(result, 0));
3426     vpcmpeqw(vec3, vec3, vec1, 1);
3427     vptest(vec2, vec3);
3428     jcc(Assembler::carryClear, FOUND_CHAR);
3429     addptr(result, 32);
3430     subl(tmp, 2*stride);
3431     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3432     jmp(SCAN_TO_8_CHAR);
3433     bind(SCAN_TO_8_CHAR_INIT);
3434     movdl(vec1, ch);
3435     pshuflw(vec1, vec1, 0x00);
3436     pshufd(vec1, vec1, 0);
3437     pxor(vec2, vec2);
3438   }
3439   bind(SCAN_TO_8_CHAR);
3440   cmpl(cnt1, stride);
3441   jcc(Assembler::less, SCAN_TO_CHAR);
3442   if (UseAVX < 2) {
3443     movdl(vec1, ch);
3444     pshuflw(vec1, vec1, 0x00);
3445     pshufd(vec1, vec1, 0);
3446     pxor(vec2, vec2);
3447   }
3448   movl(tmp, cnt1);
3449   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3450   andl(cnt1,0x00000007);  //tail count (in chars)
3451 
3452   bind(SCAN_TO_8_CHAR_LOOP);
3453   movdqu(vec3, Address(result, 0));
3454   pcmpeqw(vec3, vec1);
3455   ptest(vec2, vec3);
3456   jcc(Assembler::carryClear, FOUND_CHAR);
3457   addptr(result, 16);
3458   subl(tmp, stride);
3459   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3460   bind(SCAN_TO_CHAR);
3461   testl(cnt1, cnt1);
3462   jcc(Assembler::zero, RET_NOT_FOUND);
3463   bind(SCAN_TO_CHAR_LOOP);
3464   load_unsigned_short(tmp, Address(result, 0));
3465   cmpl(ch, tmp);
3466   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3467   addptr(result, 2);
3468   subl(cnt1, 1);
3469   jccb(Assembler::zero, RET_NOT_FOUND);
3470   jmp(SCAN_TO_CHAR_LOOP);
3471 
3472   bind(RET_NOT_FOUND);
3473   movl(result, -1);
3474   jmpb(DONE_LABEL);
3475 
3476   bind(FOUND_CHAR);
3477   if (UseAVX >= 2) {
3478     vpmovmskb(tmp, vec3);
3479   } else {
3480     pmovmskb(tmp, vec3);
3481   }
3482   bsfl(ch, tmp);
3483   addptr(result, ch);
3484 
3485   bind(FOUND_SEQ_CHAR);
3486   subptr(result, str1);
3487   shrl(result, 1);
3488 
3489   bind(DONE_LABEL);
3490 } // string_indexof_char
3491 
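// Latin-1 variant of string_indexof_char above: scans a byte array for a single
// byte value and returns its index, or -1 if the value is not found.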
3492 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3493                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3494   ShortBranchVerifier sbv(this);
3495   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3496 
3497   int stride = 16;
3498 
3499   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3500         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3501         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3502         FOUND_SEQ_CHAR, DONE_LABEL;
3503 
3504   movptr(result, str1);
3505   if (UseAVX >= 2) {
3506     cmpl(cnt1, stride);
3507     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3508     cmpl(cnt1, stride*2);
3509     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3510     movdl(vec1, ch);
3511     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3512     vpxor(vec2, vec2);
3513     movl(tmp, cnt1);
3514     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3515     andl(cnt1,0x0000001F);  //tail count (in chars)
3516 
3517     bind(SCAN_TO_32_CHAR_LOOP);
3518     vmovdqu(vec3, Address(result, 0));
3519     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3520     vptest(vec2, vec3);
3521     jcc(Assembler::carryClear, FOUND_CHAR);
3522     addptr(result, 32);
3523     subl(tmp, stride*2);
3524     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3525     jmp(SCAN_TO_16_CHAR);
3526 
3527     bind(SCAN_TO_16_CHAR_INIT);
3528     movdl(vec1, ch);
3529     pxor(vec2, vec2);
3530     pshufb(vec1, vec2);
3531   }
3532 
3533   bind(SCAN_TO_16_CHAR);
3534   cmpl(cnt1, stride);
3535   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3536   if (UseAVX < 2) {
3537     movdl(vec1, ch);
3538     pxor(vec2, vec2);
3539     pshufb(vec1, vec2);
3540   }
3541   movl(tmp, cnt1);
3542   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3543   andl(cnt1,0x0000000F);  //tail count (in bytes)
3544 
3545   bind(SCAN_TO_16_CHAR_LOOP);
3546   movdqu(vec3, Address(result, 0));
3547   pcmpeqb(vec3, vec1);
3548   ptest(vec2, vec3);
3549   jcc(Assembler::carryClear, FOUND_CHAR);
3550   addptr(result, 16);
3551   subl(tmp, stride);
3552   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3553 
3554   bind(SCAN_TO_CHAR_INIT);
3555   testl(cnt1, cnt1);
3556   jcc(Assembler::zero, RET_NOT_FOUND);
3557   bind(SCAN_TO_CHAR_LOOP);
3558   load_unsigned_byte(tmp, Address(result, 0));
3559   cmpl(ch, tmp);
3560   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3561   addptr(result, 1);
3562   subl(cnt1, 1);
3563   jccb(Assembler::zero, RET_NOT_FOUND);
3564   jmp(SCAN_TO_CHAR_LOOP);
3565 
3566   bind(RET_NOT_FOUND);
3567   movl(result, -1);
3568   jmpb(DONE_LABEL);
3569 
3570   bind(FOUND_CHAR);
3571   if (UseAVX >= 2) {
3572     vpmovmskb(tmp, vec3);
3573   } else {
3574     pmovmskb(tmp, vec3);
3575   }
3576   bsfl(ch, tmp);
3577   addptr(result, ch);
3578 
3579   bind(FOUND_SEQ_CHAR);
3580   subptr(result, str1);
3581 
3582   bind(DONE_LABEL);
3583 } // stringL_indexof_char
3584 
3585 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3586   switch (eltype) {
3587   case T_BOOLEAN: return sizeof(jboolean);
3588   case T_BYTE:  return sizeof(jbyte);
3589   case T_SHORT: return sizeof(jshort);
3590   case T_CHAR:  return sizeof(jchar);
3591   case T_INT:   return sizeof(jint);
3592   default:
3593     ShouldNotReachHere();
3594     return -1;
3595   }
3596 }
3597 
3598 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3599   switch (eltype) {
3600   // T_BOOLEAN used as surrogate for unsigned byte
3601   case T_BOOLEAN: movzbl(dst, src);   break;
3602   case T_BYTE:    movsbl(dst, src);   break;
3603   case T_SHORT:   movswl(dst, src);   break;
3604   case T_CHAR:    movzwl(dst, src);   break;
3605   case T_INT:     movl(dst, src);     break;
3606   default:
3607     ShouldNotReachHere();
3608   }
3609 }
3610 
3611 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3612   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3613 }
3614 
3615 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3616   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3617 }
3618 
3619 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3620   const int vlen = Assembler::AVX_256bit;
3621   switch (eltype) {
3622   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3623   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3624   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3625   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3626   case T_INT:
3627     // do nothing
3628     break;
3629   default:
3630     ShouldNotReachHere();
3631   }
3632 }
3633 
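// A scalar sketch of the polynomial hash being vectorized below (not the
// emitted code):
//
//   for (int i = 0; i < cnt1; i++) {
//     result = 31 * result + ary1[i];
//   }
//
// The unrolled vector loop consumes 32 elements per iteration: each iteration
// multiplies the running result by the first entry of
// arrays_hashcode_powers_of_31() (presumably 31^32) and updates four 8-lane
// accumulators as vresult = vresult * vnext + elements; after the loop the
// accumulators are weighted by the remaining powers of 31 and reduced back
// into result.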
3634 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3635                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3636                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3637                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3638                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3639                                         BasicType eltype) {
3640   ShortBranchVerifier sbv(this);
3641   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3642   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3643   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3644 
3645   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3646         SHORT_UNROLLED_LOOP_EXIT,
3647         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3648         UNROLLED_VECTOR_LOOP_BEGIN,
3649         END;
3650   switch (eltype) {
3651   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3652   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3653   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3654   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3655   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3656   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3657   }
3658 
3659   // Register "renaming" for readability of the code
3660   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3661                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3662                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3663 
3664   const int elsize = arrays_hashcode_elsize(eltype);
3665 
3666   /*
3667     if (cnt1 >= 2) {
3668       if (cnt1 >= 32) {
3669         UNROLLED VECTOR LOOP
3670       }
3671       UNROLLED SCALAR LOOP
3672     }
3673     SINGLE SCALAR
3674    */
3675 
3676   cmpl(cnt1, 32);
3677   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3678 
3679   // cnt1 >= 32 && generate_vectorized_loop
3680   xorl(index, index);
3681 
3682   // vresult = IntVector.zero(I256);
3683   for (int idx = 0; idx < 4; idx++) {
3684     vpxor(vresult[idx], vresult[idx]);
3685   }
3686   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3687   Register bound = tmp2;
3688   Register next = tmp3;
3689   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3690   movl(next, Address(tmp2, 0));
3691   movdl(vnext, next);
3692   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3693 
3694   // index = 0;
3695   // bound = cnt1 & ~(32 - 1);
3696   movl(bound, cnt1);
3697   andl(bound, ~(32 - 1));
3698   // for (; index < bound; index += 32) {
3699   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3700   // result *= next;
3701   imull(result, next);
3702   // Loop fission to front-load the cost of fetching from memory; OOO execution
3703   // can then hopefully do a better job of prefetching
3704   for (int idx = 0; idx < 4; idx++) {
3705     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3706   }
3707   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3708   for (int idx = 0; idx < 4; idx++) {
3709     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3710     arrays_hashcode_elvcast(vtmp[idx], eltype);
3711     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3712   }
3713   // index += 32;
3714   addl(index, 32);
3715   // index < bound;
3716   cmpl(index, bound);
3717   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3718   // }
3719 
3720   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3721   subl(cnt1, bound);
3722   // release bound
3723 
3724   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3725   for (int idx = 0; idx < 4; idx++) {
3726     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3727     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3728     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3729   }
3730   // result += vresult.reduceLanes(ADD);
3731   for (int idx = 0; idx < 4; idx++) {
3732     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3733   }
3734 
3735   // } else if (cnt1 < 32) {
3736 
3737   bind(SHORT_UNROLLED_BEGIN);
3738   // int i = 1;
3739   movl(index, 1);
3740   cmpl(index, cnt1);
3741   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3742 
3743   // for (; i < cnt1 ; i += 2) {
3744   bind(SHORT_UNROLLED_LOOP_BEGIN);
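  // One unrolled iteration folds two elements into the running hash:
  //   result = 31*31*result + 31*ary1[i-1] + ary1[i]
  // (961 == 31*31, and 31*x is computed below as (x << 5) - x)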
3745   movl(tmp3, 961);
3746   imull(result, tmp3);
3747   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3748   movl(tmp3, tmp2);
3749   shll(tmp3, 5);
3750   subl(tmp3, tmp2);
3751   addl(result, tmp3);
3752   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3753   addl(result, tmp3);
3754   addl(index, 2);
3755   cmpl(index, cnt1);
3756   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3757 
3758   // }
3759   // if (i >= cnt1) {
3760   bind(SHORT_UNROLLED_LOOP_EXIT);
3761   jccb(Assembler::greater, END);
3762   movl(tmp2, result);
3763   shll(result, 5);
3764   subl(result, tmp2);
3765   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3766   addl(result, tmp3);
3767   // }
3768   bind(END);
3769 
3770   BLOCK_COMMENT("} // arrays_hashcode");
3771 
3772 } // arrays_hashcode
3773 
3774 // helper function for string_compare
3775 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3776                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3777                                            Address::ScaleFactor scale2, Register index, int ae) {
3778   if (ae == StrIntrinsicNode::LL) {
3779     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3780     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3781   } else if (ae == StrIntrinsicNode::UU) {
3782     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3783     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3784   } else {
3785     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3786     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3787   }
3788 }
3789 
3790 // Compare strings, used for char[] and byte[].
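// Conceptually (a sketch of the semantics, not the emitted code, ignoring the
// byte-vs-char length adjustments done at entry and at LENGTH_DIFF_LABEL):
//
//   int lim = min(cnt1, cnt2);
//   for (int i = 0; i < lim; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;
//
// For the UL case the result is negated at DONE_LABEL below.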
3791 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3792                                        Register cnt1, Register cnt2, Register result,
3793                                        XMMRegister vec1, int ae, KRegister mask) {
3794   ShortBranchVerifier sbv(this);
3795   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3796   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3797   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3798   int stride2x2 = 0x40;
3799   Address::ScaleFactor scale = Address::no_scale;
3800   Address::ScaleFactor scale1 = Address::no_scale;
3801   Address::ScaleFactor scale2 = Address::no_scale;
3802 
3803   if (ae != StrIntrinsicNode::LL) {
3804     stride2x2 = 0x20;
3805   }
3806 
3807   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3808     shrl(cnt2, 1);
3809   }
3810   // Compute the minimum of the string lengths and push the
3811   // difference of the string lengths onto the stack.
3812   // Use a conditional move to select the minimum.
3813   movl(result, cnt1);
3814   subl(cnt1, cnt2);
3815   push(cnt1);
3816   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3817 
3818   // Is the minimum length zero?
3819   testl(cnt2, cnt2);
3820   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3821   if (ae == StrIntrinsicNode::LL) {
3822     // Load first bytes
3823     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3824     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3825   } else if (ae == StrIntrinsicNode::UU) {
3826     // Load first characters
3827     load_unsigned_short(result, Address(str1, 0));
3828     load_unsigned_short(cnt1, Address(str2, 0));
3829   } else {
3830     load_unsigned_byte(result, Address(str1, 0));
3831     load_unsigned_short(cnt1, Address(str2, 0));
3832   }
3833   subl(result, cnt1);
3834   jcc(Assembler::notZero,  POP_LABEL);
3835 
3836   if (ae == StrIntrinsicNode::UU) {
3837     // Divide length by 2 to get number of chars
3838     shrl(cnt2, 1);
3839   }
3840   cmpl(cnt2, 1);
3841   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3842 
3843   // Check if the strings start at the same location and set up the scale and stride
3844   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3845     cmpptr(str1, str2);
3846     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3847     if (ae == StrIntrinsicNode::LL) {
3848       scale = Address::times_1;
3849       stride = 16;
3850     } else {
3851       scale = Address::times_2;
3852       stride = 8;
3853     }
3854   } else {
3855     scale1 = Address::times_1;
3856     scale2 = Address::times_2;
3857     // scale not used
3858     stride = 8;
3859   }
3860 
3861   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3862     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3863     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3864     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3865     Label COMPARE_TAIL_LONG;
3866     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3867 
3868     int pcmpmask = 0x19;
3869     if (ae == StrIntrinsicNode::LL) {
3870       pcmpmask &= ~0x01;
3871     }
3872 
3873     // Set up to compare 16-char (32-byte) vectors;
3874     // start from the first character again because it has an aligned address.
3875     if (ae == StrIntrinsicNode::LL) {
3876       stride2 = 32;
3877     } else {
3878       stride2 = 16;
3879     }
3880     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3881       adr_stride = stride << scale;
3882     } else {
3883       adr_stride1 = 8;  //stride << scale1;
3884       adr_stride2 = 16; //stride << scale2;
3885     }
3886 
3887     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3888     // rax and rdx are used by pcmpestri as element counters
3889     movl(result, cnt2);
3890     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3891     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3892 
3893     // Fast path: compare the first two 8-char vectors.
3894     bind(COMPARE_16_CHARS);
3895     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3896       movdqu(vec1, Address(str1, 0));
3897     } else {
3898       pmovzxbw(vec1, Address(str1, 0));
3899     }
3900     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3901     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3902 
3903     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3904       movdqu(vec1, Address(str1, adr_stride));
3905       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3906     } else {
3907       pmovzxbw(vec1, Address(str1, adr_stride1));
3908       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3909     }
3910     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3911     addl(cnt1, stride);
3912 
3913     // Compare the characters at index in cnt1
3914     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3915     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3916     subl(result, cnt2);
3917     jmp(POP_LABEL);
3918 
3919     // Set up the registers to start the vector comparison loop
3920     bind(COMPARE_WIDE_VECTORS);
3921     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3922       lea(str1, Address(str1, result, scale));
3923       lea(str2, Address(str2, result, scale));
3924     } else {
3925       lea(str1, Address(str1, result, scale1));
3926       lea(str2, Address(str2, result, scale2));
3927     }
3928     subl(result, stride2);
3929     subl(cnt2, stride2);
3930     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3931     negptr(result);
3932 
3933     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3934     bind(COMPARE_WIDE_VECTORS_LOOP);
3935 
3936 #ifdef _LP64
3937     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3938       cmpl(cnt2, stride2x2);
3939       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3940       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3941       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3942 
3943       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3944       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3945         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3946         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3947       } else {
3948         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3949         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3950       }
3951       kortestql(mask, mask);
3952       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3953       addptr(result, stride2x2);  // update since we already compared at this addr
3954       subl(cnt2, stride2x2);      // and sub the size too
3955       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3956 
3957       vpxor(vec1, vec1);
3958       jmpb(COMPARE_WIDE_TAIL);
3959     }//if (VM_Version::supports_avx512vlbw())
3960 #endif // _LP64
3961 
3962 
3963     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3964     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3965       vmovdqu(vec1, Address(str1, result, scale));
3966       vpxor(vec1, Address(str2, result, scale));
3967     } else {
3968       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3969       vpxor(vec1, Address(str2, result, scale2));
3970     }
3971     vptest(vec1, vec1);
3972     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3973     addptr(result, stride2);
3974     subl(cnt2, stride2);
3975     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3976     // clean upper bits of YMM registers
3977     vpxor(vec1, vec1);
3978 
3979     // compare wide vectors tail
3980     bind(COMPARE_WIDE_TAIL);
3981     testptr(result, result);
3982     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3983 
3984     movl(result, stride2);
3985     movl(cnt2, result);
3986     negptr(result);
3987     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3988 
3989     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3990     bind(VECTOR_NOT_EQUAL);
3991     // clean upper bits of YMM registers
3992     vpxor(vec1, vec1);
3993     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3994       lea(str1, Address(str1, result, scale));
3995       lea(str2, Address(str2, result, scale));
3996     } else {
3997       lea(str1, Address(str1, result, scale1));
3998       lea(str2, Address(str2, result, scale2));
3999     }
4000     jmp(COMPARE_16_CHARS);
4001 
4002     // Compare tail chars, length between 1 and 15 chars
4003     bind(COMPARE_TAIL_LONG);
4004     movl(cnt2, result);
4005     cmpl(cnt2, stride);
4006     jcc(Assembler::less, COMPARE_SMALL_STR);
4007 
4008     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4009       movdqu(vec1, Address(str1, 0));
4010     } else {
4011       pmovzxbw(vec1, Address(str1, 0));
4012     }
4013     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4014     jcc(Assembler::below, COMPARE_INDEX_CHAR);
4015     subptr(cnt2, stride);
4016     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4017     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4018       lea(str1, Address(str1, result, scale));
4019       lea(str2, Address(str2, result, scale));
4020     } else {
4021       lea(str1, Address(str1, result, scale1));
4022       lea(str2, Address(str2, result, scale2));
4023     }
4024     negptr(cnt2);
4025     jmpb(WHILE_HEAD_LABEL);
4026 
4027     bind(COMPARE_SMALL_STR);
4028   } else if (UseSSE42Intrinsics) {
4029     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
4030     int pcmpmask = 0x19;
4031     // Set up to compare 8-char (16-byte) vectors;
4032     // start from the first character again because it has an aligned address.
4033     movl(result, cnt2);
4034     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
4035     if (ae == StrIntrinsicNode::LL) {
4036       pcmpmask &= ~0x01;
4037     }
4038     jcc(Assembler::zero, COMPARE_TAIL);
4039     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4040       lea(str1, Address(str1, result, scale));
4041       lea(str2, Address(str2, result, scale));
4042     } else {
4043       lea(str1, Address(str1, result, scale1));
4044       lea(str2, Address(str2, result, scale2));
4045     }
4046     negptr(result);
4047 
4048     // pcmpestri
4049     //   inputs:
4050     //     vec1- substring
4051     //     rax - negative string length (elements count)
4052     //     mem - scanned string
4053     //     rdx - string length (elements count)
4054     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4055     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4056     //   outputs:
4057     //     rcx - first mismatched element index
4058     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4059 
4060     bind(COMPARE_WIDE_VECTORS);
4061     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4062       movdqu(vec1, Address(str1, result, scale));
4063       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4064     } else {
4065       pmovzxbw(vec1, Address(str1, result, scale1));
4066       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4067     }
4068     // After pcmpestri cnt1(rcx) contains mismatched element index
4069 
4070     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4071     addptr(result, stride);
4072     subptr(cnt2, stride);
4073     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4074 
4075     // compare wide vectors tail
4076     testptr(result, result);
4077     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4078 
4079     movl(cnt2, stride);
4080     movl(result, stride);
4081     negptr(result);
4082     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4083       movdqu(vec1, Address(str1, result, scale));
4084       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4085     } else {
4086       pmovzxbw(vec1, Address(str1, result, scale1));
4087       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4088     }
4089     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4090 
4091     // Mismatched characters in the vectors
4092     bind(VECTOR_NOT_EQUAL);
4093     addptr(cnt1, result);
4094     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4095     subl(result, cnt2);
4096     jmpb(POP_LABEL);
4097 
4098     bind(COMPARE_TAIL); // limit is zero
4099     movl(cnt2, result);
4100     // Fallthru to tail compare
4101   }
4102   // Shift str2 and str1 to the end of the arrays, negate min
4103   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4104     lea(str1, Address(str1, cnt2, scale));
4105     lea(str2, Address(str2, cnt2, scale));
4106   } else {
4107     lea(str1, Address(str1, cnt2, scale1));
4108     lea(str2, Address(str2, cnt2, scale2));
4109   }
4110   decrementl(cnt2);  // first character was compared already
4111   negptr(cnt2);
4112 
4113   // Compare the rest of the elements
4114   bind(WHILE_HEAD_LABEL);
4115   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4116   subl(result, cnt1);
4117   jccb(Assembler::notZero, POP_LABEL);
4118   increment(cnt2);
4119   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4120 
4121   // Strings are equal up to min length.  Return the length difference.
4122   bind(LENGTH_DIFF_LABEL);
4123   pop(result);
4124   if (ae == StrIntrinsicNode::UU) {
4125     // Divide diff by 2 to get number of chars
4126     sarl(result, 1);
4127   }
4128   jmpb(DONE_LABEL);
4129 
4130 #ifdef _LP64
4131   if (VM_Version::supports_avx512vlbw()) {
4132 
4133     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4134 
4135     kmovql(cnt1, mask);
4136     notq(cnt1);
4137     bsfq(cnt2, cnt1);
4138     if (ae != StrIntrinsicNode::LL) {
4139       // Divide diff by 2 to get number of chars
4140       sarl(cnt2, 1);
4141     }
4142     addq(result, cnt2);
4143     if (ae == StrIntrinsicNode::LL) {
4144       load_unsigned_byte(cnt1, Address(str2, result));
4145       load_unsigned_byte(result, Address(str1, result));
4146     } else if (ae == StrIntrinsicNode::UU) {
4147       load_unsigned_short(cnt1, Address(str2, result, scale));
4148       load_unsigned_short(result, Address(str1, result, scale));
4149     } else {
4150       load_unsigned_short(cnt1, Address(str2, result, scale2));
4151       load_unsigned_byte(result, Address(str1, result, scale1));
4152     }
4153     subl(result, cnt1);
4154     jmpb(POP_LABEL);
4155   }//if (VM_Version::supports_avx512vlbw())
4156 #endif // _LP64
4157 
4158   // Discard the stored length difference
4159   bind(POP_LABEL);
4160   pop(cnt1);
4161 
4162   // That's it
4163   bind(DONE_LABEL);
4164   if (ae == StrIntrinsicNode::UL) {
4165     negl(result);
4166   }
4167 
4168 }
4169 
4170 // Search for a non-ASCII character (i.e. a negative byte value) in a byte array and
4171 // return the index of the first such character, or otherwise the length
4172 // of the array segment searched.
4173 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4174 //   @IntrinsicCandidate
4175 //   public static int countPositives(byte[] ba, int off, int len) {
4176 //     for (int i = off; i < off + len; i++) {
4177 //       if (ba[i] < 0) {
4178 //         return i - off;
4179 //       }
4180 //     }
4181 //     return len;
4182 //   }
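     //
     // For illustration, countPositives(new byte[] {1, 2, -3, 4}, 0, 4) returns 2,
     // while a segment with no negative bytes returns len.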
4183 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4184                                         Register result, Register tmp1,
4185                                         XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4186   // rsi: byte array
4187   // rcx: len
4188   // rax: result
4189   ShortBranchVerifier sbv(this);
4190   assert_different_registers(ary1, len, result, tmp1);
4191   assert_different_registers(vec1, vec2);
4192   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4193 
4194   movl(result, len); // copy
4195   // len == 0
4196   testl(len, len);
4197   jcc(Assembler::zero, DONE);
4198 
4199   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4200     VM_Version::supports_avx512vlbw() &&
4201     VM_Version::supports_bmi2()) {
4202 
4203     Label test_64_loop, test_tail, BREAK_LOOP;
4204     movl(tmp1, len);
4205     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4206 
4207     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4208     andl(len,  0xffffffc0); // vector count (in chars)
4209     jccb(Assembler::zero, test_tail);
4210 
4211     lea(ary1, Address(ary1, len, Address::times_1));
4212     negptr(len);
4213 
4214     bind(test_64_loop);
4215     // Check whether these 64 bytes contain any negatives
4216     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4217     kortestql(mask1, mask1);
4218     jcc(Assembler::notZero, BREAK_LOOP);
4219 
4220     addptr(len, 64);
4221     jccb(Assembler::notZero, test_64_loop);
4222 
4223     bind(test_tail);
4224     // bail out when there is nothing to be done
4225     testl(tmp1, -1);
4226     jcc(Assembler::zero, DONE);
4227 
4228 
4229     // check the tail for the absence of negatives
4230     // ~(~0 << len), applied in up to two steps (for the 32-bit scenario)
4231 #ifdef _LP64
4232     {
4233       Register tmp3_aliased = len;
4234       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4235       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4236       notq(tmp3_aliased);
4237       kmovql(mask2, tmp3_aliased);
4238     }
4239 #else
4240     Label k_init;
4241     jmp(k_init);
4242 
4243     // We cannot read 64 bits from a general purpose register, so the data
4244     // required to compose the 64-bit mask is emitted into the instruction stream.
4245     // We emit a 64-byte wide series of elements 0..63 which is later used as the
4246     // compare target against the tail count held in the tmp1 register.
4247     // The result is a k register with tmp1 consecutive 1 bits, counting from
4248     // the least significant bit.
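         // For example, with tmp1 == 5 the broadcast-and-compare below sets exactly
         // the low 5 bits of mask2, the same ~(~0 << 5) value the 64-bit path
         // computes directly.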
4249     address tmp = pc();
4250     emit_int64(0x0706050403020100);
4251     emit_int64(0x0F0E0D0C0B0A0908);
4252     emit_int64(0x1716151413121110);
4253     emit_int64(0x1F1E1D1C1B1A1918);
4254     emit_int64(0x2726252423222120);
4255     emit_int64(0x2F2E2D2C2B2A2928);
4256     emit_int64(0x3736353433323130);
4257     emit_int64(0x3F3E3D3C3B3A3938);
4258 
4259     bind(k_init);
4260     lea(len, InternalAddress(tmp));
4261     // create the k mask used by the negative-byte test below: low tmp1 bits set (tail elements only)
4262     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4263     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4264 
4265 #endif
4266     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4267     ktestq(mask1, mask2);
4268     jcc(Assembler::zero, DONE);
4269 
4270     // do a full check for negative bytes in the tail
4271     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4272                      // ary1 already pointing to the right place
4273     jmpb(TAIL_START);
4274 
4275     bind(BREAK_LOOP);
4276     // At least one byte in the last 64 byte block was negative.
4277     // Set up to look at the last 64 bytes as if they were a tail
4278     lea(ary1, Address(ary1, len, Address::times_1));
4279     addptr(result, len);
4280     // Ignore the very last byte: if all others are positive,
4281     // it must be negative, so we can skip right to the 2+1 byte
4282     // end comparison at this point
4283     orl(result, 63);
4284     movl(len, 63);
4285     // Fallthru to tail compare
4286   } else {
4287 
4288     if (UseAVX >= 2 && UseSSE >= 2) {
4289       // With AVX2, use 32-byte vector compare
4290       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4291 
4292       // Compare 32-byte vectors
4293       testl(len, 0xffffffe0);   // vector count (in bytes)
4294       jccb(Assembler::zero, TAIL_START);
4295 
4296       andl(len, 0xffffffe0);
4297       lea(ary1, Address(ary1, len, Address::times_1));
4298       negptr(len);
4299 
4300       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in the vector
4301       movdl(vec2, tmp1);
4302       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4303 
4304       bind(COMPARE_WIDE_VECTORS);
4305       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4306       vptest(vec1, vec2);
4307       jccb(Assembler::notZero, BREAK_LOOP);
4308       addptr(len, 32);
4309       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4310 
4311       testl(result, 0x0000001f);   // any bytes remaining?
4312       jcc(Assembler::zero, DONE);
4313 
4314       // Quick test using the already prepared vector mask
4315       movl(len, result);
4316       andl(len, 0x0000001f);
4317       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4318       vptest(vec1, vec2);
4319       jcc(Assembler::zero, DONE);
4320       // There are negative bytes, jump to the tail to determine exactly where
4321       jmpb(TAIL_START);
4322 
4323       bind(BREAK_LOOP);
4324       // At least one byte in the last 32-byte vector is negative.
4325       // Set up to look at the last 32 bytes as if they were a tail
4326       lea(ary1, Address(ary1, len, Address::times_1));
4327       addptr(result, len);
4328       // Ignore the very last byte: if all others are positive,
4329       // it must be negative, so we can skip right to the 2+1 byte
4330       // end comparison at this point
4331       orl(result, 31);
4332       movl(len, 31);
4333       // Fallthru to tail compare
4334     } else if (UseSSE42Intrinsics) {
4335       // With SSE4.2, use double quad vector compare
4336       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4337 
4338       // Compare 16-byte vectors
4339       testl(len, 0xfffffff0);   // vector count (in bytes)
4340       jcc(Assembler::zero, TAIL_START);
4341 
4342       andl(len, 0xfffffff0);
4343       lea(ary1, Address(ary1, len, Address::times_1));
4344       negptr(len);
4345 
4346       movl(tmp1, 0x80808080);
4347       movdl(vec2, tmp1);
4348       pshufd(vec2, vec2, 0);
4349 
4350       bind(COMPARE_WIDE_VECTORS);
4351       movdqu(vec1, Address(ary1, len, Address::times_1));
4352       ptest(vec1, vec2);
4353       jccb(Assembler::notZero, BREAK_LOOP);
4354       addptr(len, 16);
4355       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4356 
4357       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4358       jcc(Assembler::zero, DONE);
4359 
4360       // Quick test using the already prepared vector mask
4361       movl(len, result);
4362       andl(len, 0x0000000f);   // tail count (in bytes)
4363       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4364       ptest(vec1, vec2);
4365       jcc(Assembler::zero, DONE);
4366       jmpb(TAIL_START);
4367 
4368       bind(BREAK_LOOP);
4369       // At least one byte in the last 16-byte vector is negative.
4370       // Set up and look at the last 16 bytes as if they were a tail
4371       lea(ary1, Address(ary1, len, Address::times_1));
4372       addptr(result, len);
4373       // Ignore the very last byte: if all others are positive,
4374       // it must be negative, so we can skip right to the 2+1 byte
4375       // end comparison at this point
4376       orl(result, 15);
4377       movl(len, 15);
4378       // Fallthru to tail compare
4379     }
4380   }
4381 
4382   bind(TAIL_START);
4383   // Compare 4-byte vectors
4384   andl(len, 0xfffffffc); // vector count (in bytes)
4385   jccb(Assembler::zero, COMPARE_CHAR);
4386 
4387   lea(ary1, Address(ary1, len, Address::times_1));
4388   negptr(len);
4389 
4390   bind(COMPARE_VECTORS);
4391   movl(tmp1, Address(ary1, len, Address::times_1));
4392   andl(tmp1, 0x80808080);
4393   jccb(Assembler::notZero, TAIL_ADJUST);
4394   addptr(len, 4);
4395   jccb(Assembler::notZero, COMPARE_VECTORS);
4396 
4397   // Compare trailing char (final 2-3 bytes), if any
4398   bind(COMPARE_CHAR);
4399 
4400   testl(result, 0x2);   // tail  char
4401   jccb(Assembler::zero, COMPARE_BYTE);
4402   load_unsigned_short(tmp1, Address(ary1, 0));
4403   andl(tmp1, 0x00008080);
4404   jccb(Assembler::notZero, CHAR_ADJUST);
4405   lea(ary1, Address(ary1, 2));
4406 
4407   bind(COMPARE_BYTE);
4408   testl(result, 0x1);   // tail  byte
4409   jccb(Assembler::zero, DONE);
4410   load_unsigned_byte(tmp1, Address(ary1, 0));
4411   testl(tmp1, 0x00000080);
4412   jccb(Assembler::zero, DONE);
4413   subptr(result, 1);
4414   jmpb(DONE);
4415 
4416   bind(TAIL_ADJUST);
4417   // There are negative bytes in the last 4-byte block.
4418   // Adjust result and check the next three bytes
4419   addptr(result, len);
4420   orl(result, 3);
4421   lea(ary1, Address(ary1, len, Address::times_1));
4422   jmpb(COMPARE_CHAR);
4423 
4424   bind(CHAR_ADJUST);
4425   // We are looking at a char + optional byte tail, and found that one
4426   // of the bytes in the char is negative. Adjust the result, check the
4427   // first byte and readjust if needed.
4428   andl(result, 0xfffffffc);
4429   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4430   jccb(Assembler::notZero, DONE);
4431   addptr(result, 1);
4432 
4433   // That's it
4434   bind(DONE);
4435   if (UseAVX >= 2 && UseSSE >= 2) {
4436     // clean upper bits of YMM registers
4437     vpxor(vec1, vec1);
4438     vpxor(vec2, vec2);
4439   }
4440 }
4441 
4442 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
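     // Sets result to 1 when the ranges compare equal and to 0 otherwise. The oop,
     // null and length checks below are only performed in the is_array_equ case.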
4443 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4444                                       Register limit, Register result, Register chr,
4445                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4446   ShortBranchVerifier sbv(this);
4447   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4448 
4449   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4450   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4451 
4452   if (is_array_equ) {
4453     // Check the input args
4454     cmpoop(ary1, ary2);
4455     jcc(Assembler::equal, TRUE_LABEL);
4456 
4457     // Need additional checks for arrays_equals.
4458     testptr(ary1, ary1);
4459     jcc(Assembler::zero, FALSE_LABEL);
4460     testptr(ary2, ary2);
4461     jcc(Assembler::zero, FALSE_LABEL);
4462 
4463     // Check the lengths
4464     movl(limit, Address(ary1, length_offset));
4465     cmpl(limit, Address(ary2, length_offset));
4466     jcc(Assembler::notEqual, FALSE_LABEL);
4467   }
4468 
4469   // count == 0
4470   testl(limit, limit);
4471   jcc(Assembler::zero, TRUE_LABEL);
4472 
4473   if (is_array_equ) {
4474     // Load array address
4475     lea(ary1, Address(ary1, base_offset));
4476     lea(ary2, Address(ary2, base_offset));
4477   }
4478 
4479   if (is_array_equ && is_char) {
4480     // arrays_equals when used for char[]:
4481     shll(limit, 1);      // convert char count to byte count (still != 0)
4482   }
4483   movl(result, limit); // copy
4484 
4485   if (UseAVX >= 2) {
4486     // With AVX2, use 32-byte vector compare
4487     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4488 
4489     // Compare 32-byte vectors
4490     andl(result, 0x0000001f);  //   tail count (in bytes)
4491     andl(limit, 0xffffffe0);   // vector count (in bytes)
4492     jcc(Assembler::zero, COMPARE_TAIL);
4493 
4494     lea(ary1, Address(ary1, limit, Address::times_1));
4495     lea(ary2, Address(ary2, limit, Address::times_1));
4496     negptr(limit);
4497 
4498 #ifdef _LP64
4499     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4500       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4501 
4502       cmpl(limit, -64);
4503       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4504 
4505       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4506 
4507       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4508       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4509       kortestql(mask, mask);
4510       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4511       addptr(limit, 64);  // update since we already compared at this addr
4512       cmpl(limit, -64);
4513       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4514 
4515       // At this point we may still need to compare -limit+result bytes.
4516       // We could execute the next two instructions and just continue via the non-wide path:
4517       //  cmpl(limit, 0);
4518       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4519       // But since we stopped at the points ary{1,2}+limit, which are
4520       // no farther than 64 bytes from the array ends ary{1,2}+result
4521       // (|limit| <= 32 and result < 32),
4522       // we may just compare the last 64 bytes.
4523       //
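           // Concrete illustration: for 100-byte arrays, result == 4 and the loop above
           // stops with limit == -32 after comparing bytes [0, 64); the 64-byte compare
           // below then covers bytes [36, 100), re-checking a few already verified bytes,
           // which is harmless.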
4524       addptr(result, -64);   // it is safe, bc we just came from this area
4525       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4526       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4527       kortestql(mask, mask);
4528       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4529 
4530       jmp(TRUE_LABEL);
4531 
4532       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4533 
4534     }//if (VM_Version::supports_avx512vlbw())
4535 #endif //_LP64
4536     bind(COMPARE_WIDE_VECTORS);
4537     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4538     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4539     vpxor(vec1, vec2);
4540 
4541     vptest(vec1, vec1);
4542     jcc(Assembler::notZero, FALSE_LABEL);
4543     addptr(limit, 32);
4544     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4545 
4546     testl(result, result);
4547     jcc(Assembler::zero, TRUE_LABEL);
4548 
4549     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4550     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4551     vpxor(vec1, vec2);
4552 
4553     vptest(vec1, vec1);
4554     jccb(Assembler::notZero, FALSE_LABEL);
4555     jmpb(TRUE_LABEL);
4556 
4557     bind(COMPARE_TAIL); // limit is zero
4558     movl(limit, result);
4559     // Fallthru to tail compare
4560   } else if (UseSSE42Intrinsics) {
4561     // With SSE4.2, use double quad vector compare
4562     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4563 
4564     // Compare 16-byte vectors
4565     andl(result, 0x0000000f);  //   tail count (in bytes)
4566     andl(limit, 0xfffffff0);   // vector count (in bytes)
4567     jcc(Assembler::zero, COMPARE_TAIL);
4568 
4569     lea(ary1, Address(ary1, limit, Address::times_1));
4570     lea(ary2, Address(ary2, limit, Address::times_1));
4571     negptr(limit);
4572 
4573     bind(COMPARE_WIDE_VECTORS);
4574     movdqu(vec1, Address(ary1, limit, Address::times_1));
4575     movdqu(vec2, Address(ary2, limit, Address::times_1));
4576     pxor(vec1, vec2);
4577 
4578     ptest(vec1, vec1);
4579     jcc(Assembler::notZero, FALSE_LABEL);
4580     addptr(limit, 16);
4581     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4582 
4583     testl(result, result);
4584     jcc(Assembler::zero, TRUE_LABEL);
4585 
4586     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4587     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4588     pxor(vec1, vec2);
4589 
4590     ptest(vec1, vec1);
4591     jccb(Assembler::notZero, FALSE_LABEL);
4592     jmpb(TRUE_LABEL);
4593 
4594     bind(COMPARE_TAIL); // limit is zero
4595     movl(limit, result);
4596     // Fallthru to tail compare
4597   }
4598 
4599   // Compare 4-byte vectors
4600   andl(limit, 0xfffffffc); // vector count (in bytes)
4601   jccb(Assembler::zero, COMPARE_CHAR);
4602 
4603   lea(ary1, Address(ary1, limit, Address::times_1));
4604   lea(ary2, Address(ary2, limit, Address::times_1));
4605   negptr(limit);
4606 
4607   bind(COMPARE_VECTORS);
4608   movl(chr, Address(ary1, limit, Address::times_1));
4609   cmpl(chr, Address(ary2, limit, Address::times_1));
4610   jccb(Assembler::notEqual, FALSE_LABEL);
4611   addptr(limit, 4);
4612   jcc(Assembler::notZero, COMPARE_VECTORS);
4613 
4614   // Compare trailing char (final 2 bytes), if any
4615   bind(COMPARE_CHAR);
4616   testl(result, 0x2);   // tail  char
4617   jccb(Assembler::zero, COMPARE_BYTE);
4618   load_unsigned_short(chr, Address(ary1, 0));
4619   load_unsigned_short(limit, Address(ary2, 0));
4620   cmpl(chr, limit);
4621   jccb(Assembler::notEqual, FALSE_LABEL);
4622 
4623   if (is_array_equ && is_char) {
4624     bind(COMPARE_BYTE);
4625   } else {
4626     lea(ary1, Address(ary1, 2));
4627     lea(ary2, Address(ary2, 2));
4628 
4629     bind(COMPARE_BYTE);
4630     testl(result, 0x1);   // tail  byte
4631     jccb(Assembler::zero, TRUE_LABEL);
4632     load_unsigned_byte(chr, Address(ary1, 0));
4633     load_unsigned_byte(limit, Address(ary2, 0));
4634     cmpl(chr, limit);
4635     jccb(Assembler::notEqual, FALSE_LABEL);
4636   }
4637   bind(TRUE_LABEL);
4638   movl(result, 1);   // return true
4639   jmpb(DONE);
4640 
4641   bind(FALSE_LABEL);
4642   xorl(result, result); // return false
4643 
4644   // That's it
4645   bind(DONE);
4646   if (UseAVX >= 2) {
4647     // clean upper bits of YMM registers
4648     vpxor(vec1, vec1);
4649     vpxor(vec2, vec2);
4650   }
4651 }
4652 
4653 #ifdef _LP64
4654 
4655 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4656 #define __ masm.
4657   Register dst = stub.data<0>();
4658   XMMRegister src = stub.data<1>();
4659   address target = stub.data<2>();
4660   __ bind(stub.entry());
4661   __ subptr(rsp, 8);
4662   __ movdbl(Address(rsp), src);
4663   __ call(RuntimeAddress(target));
4664   __ pop(dst);
4665   __ jmp(stub.continuation());
4666 #undef __
4667 }
4668 
4669 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4670   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4671   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4672 
4673   address slowpath_target;
4674   if (dst_bt == T_INT) {
4675     if (src_bt == T_FLOAT) {
4676       cvttss2sil(dst, src);
4677       cmpl(dst, 0x80000000);
4678       slowpath_target = StubRoutines::x86::f2i_fixup();
4679     } else {
4680       cvttsd2sil(dst, src);
4681       cmpl(dst, 0x80000000);
4682       slowpath_target = StubRoutines::x86::d2i_fixup();
4683     }
4684   } else {
4685     if (src_bt == T_FLOAT) {
4686       cvttss2siq(dst, src);
4687       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4688       slowpath_target = StubRoutines::x86::f2l_fixup();
4689     } else {
4690       cvttsd2siq(dst, src);
4691       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4692       slowpath_target = StubRoutines::x86::d2l_fixup();
4693     }
4694   }
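       // The truncating conversions above produce the x86 'integer indefinite' value
       // (the sign-bit-only pattern) when the input is NaN or out of range, so the
       // equality check below conservatively routes those inputs to the fixup stub.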
4695 
4696   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4697   jcc(Assembler::equal, stub->entry());
4698   bind(stub->continuation());
4699 }
4700 
4701 #endif // _LP64
4702 
4703 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4704                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4705   switch(ideal_opc) {
4706     case Op_LShiftVS:
4707       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4708     case Op_LShiftVI:
4709       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4710     case Op_LShiftVL:
4711       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4712     case Op_RShiftVS:
4713       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4714     case Op_RShiftVI:
4715       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4716     case Op_RShiftVL:
4717       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4718     case Op_URShiftVS:
4719       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4720     case Op_URShiftVI:
4721       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4722     case Op_URShiftVL:
4723       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4724     case Op_RotateRightV:
4725       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4726     case Op_RotateLeftV:
4727       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4728     default:
4729       fatal("Unsupported masked operation"); break;
4730   }
4731 }
4732 
4733 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4734                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4735                                     bool is_varshift) {
4736   switch (ideal_opc) {
4737     case Op_AddVB:
4738       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4739     case Op_AddVS:
4740       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4741     case Op_AddVI:
4742       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4743     case Op_AddVL:
4744       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4745     case Op_AddVF:
4746       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4747     case Op_AddVD:
4748       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4749     case Op_SubVB:
4750       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4751     case Op_SubVS:
4752       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4753     case Op_SubVI:
4754       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4755     case Op_SubVL:
4756       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4757     case Op_SubVF:
4758       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4759     case Op_SubVD:
4760       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4761     case Op_MulVS:
4762       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4763     case Op_MulVI:
4764       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4765     case Op_MulVL:
4766       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4767     case Op_MulVF:
4768       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4769     case Op_MulVD:
4770       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4771     case Op_DivVF:
4772       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4773     case Op_DivVD:
4774       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4775     case Op_SqrtVF:
4776       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4777     case Op_SqrtVD:
4778       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4779     case Op_AbsVB:
4780       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4781     case Op_AbsVS:
4782       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4783     case Op_AbsVI:
4784       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4785     case Op_AbsVL:
4786       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4787     case Op_FmaVF:
4788       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4789     case Op_FmaVD:
4790       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4791     case Op_VectorRearrange:
4792       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4793     case Op_LShiftVS:
4794       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4795     case Op_LShiftVI:
4796       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4797     case Op_LShiftVL:
4798       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4799     case Op_RShiftVS:
4800       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4801     case Op_RShiftVI:
4802       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4803     case Op_RShiftVL:
4804       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4805     case Op_URShiftVS:
4806       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4807     case Op_URShiftVI:
4808       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4809     case Op_URShiftVL:
4810       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4811     case Op_RotateLeftV:
4812       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_RotateRightV:
4814       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_MaxV:
4816       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_MinV:
4818       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_XorV:
4820       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_OrV:
4822       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_AndV:
4824       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4825     default:
4826       fatal("Unsupported masked operation"); break;
4827   }
4828 }
4829 
4830 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4831                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4832   switch (ideal_opc) {
4833     case Op_AddVB:
4834       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4835     case Op_AddVS:
4836       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4837     case Op_AddVI:
4838       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4839     case Op_AddVL:
4840       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4841     case Op_AddVF:
4842       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4843     case Op_AddVD:
4844       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4845     case Op_SubVB:
4846       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4847     case Op_SubVS:
4848       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4849     case Op_SubVI:
4850       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4851     case Op_SubVL:
4852       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4853     case Op_SubVF:
4854       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4855     case Op_SubVD:
4856       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4857     case Op_MulVS:
4858       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4859     case Op_MulVI:
4860       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4861     case Op_MulVL:
4862       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4863     case Op_MulVF:
4864       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4865     case Op_MulVD:
4866       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4867     case Op_DivVF:
4868       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4869     case Op_DivVD:
4870       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4871     case Op_FmaVF:
4872       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4873     case Op_FmaVD:
4874       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4875     case Op_MaxV:
4876       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4877     case Op_MinV:
4878       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4879     case Op_XorV:
4880       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4881     case Op_OrV:
4882       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_AndV:
4884       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4885     default:
4886       fatal("Unsupported masked operation"); break;
4887   }
4888 }
4889 
4890 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4891                                   KRegister src1, KRegister src2) {
4892   BasicType etype = T_ILLEGAL;
4893   switch(mask_len) {
4894     case 2:
4895     case 4:
4896     case 8:  etype = T_BYTE; break;
4897     case 16: etype = T_SHORT; break;
4898     case 32: etype = T_INT; break;
4899     case 64: etype = T_LONG; break;
4900     default: fatal("Unsupported type"); break;
4901   }
4902   assert(etype != T_ILLEGAL, "");
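       // Note: etype is only used to select the width of the k-register operation
       // below (an 8/16/32/64-bit mask); it does not describe the vector element type.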
4903   switch(ideal_opc) {
4904     case Op_AndVMask:
4905       kand(etype, dst, src1, src2); break;
4906     case Op_OrVMask:
4907       kor(etype, dst, src1, src2); break;
4908     case Op_XorVMask:
4909       kxor(etype, dst, src1, src2); break;
4910     default:
4911       fatal("Unsupported masked operation"); break;
4912   }
4913 }
4914 
4915 /*
4916  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4917  * If src is NaN, the result is 0.
4918  * If src is negative infinity or any value less than or equal to Integer.MIN_VALUE,
4919  * the result is Integer.MIN_VALUE.
4920  * If src is positive infinity or any value greater than or equal to Integer.MAX_VALUE,
4921  * the result is Integer.MAX_VALUE.
4922  */
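     // For example (Java cast semantics): (int)Float.NaN == 0,
     // (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE and
     // (int)1.0e10f == Integer.MAX_VALUE.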
4923 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4924                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4925                                                                    Register rscratch, AddressLiteral float_sign_flip,
4926                                                                    int vec_enc) {
4927   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4928   Label done;
4929   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4930   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4931   vptest(xtmp2, xtmp2, vec_enc);
4932   jccb(Assembler::equal, done);
4933 
4934   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4935   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4936 
4937   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4938   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4939   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4940 
4941   // Recompute the mask for remaining special value.
4942   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4943   // Extract SRC values corresponding to TRUE mask lanes.
4944   vpand(xtmp4, xtmp2, src, vec_enc);
4945   // Flip the mask bits so that the MSB of the mask lanes corresponding to +ve
4946   // special values is set.
4947   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4948 
4949   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4950   bind(done);
4951 }
4952 
4953 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4954                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4955                                                                     Register rscratch, AddressLiteral float_sign_flip,
4956                                                                     int vec_enc) {
4957   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4958   Label done;
4959   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4960   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4961   kortestwl(ktmp1, ktmp1);
4962   jccb(Assembler::equal, done);
4963 
4964   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4965   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4966   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4967 
4968   kxorwl(ktmp1, ktmp1, ktmp2);
4969   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4970   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4971   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4972   bind(done);
4973 }
4974 
4975 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4976                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4977                                                                      Register rscratch, AddressLiteral double_sign_flip,
4978                                                                      int vec_enc) {
4979   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4980 
4981   Label done;
4982   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4983   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4984   kortestwl(ktmp1, ktmp1);
4985   jccb(Assembler::equal, done);
4986 
4987   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4988   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4989   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4990 
4991   kxorwl(ktmp1, ktmp1, ktmp2);
4992   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4993   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4994   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4995   bind(done);
4996 }
4997 
4998 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4999                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5000                                                                      Register rscratch, AddressLiteral float_sign_flip,
5001                                                                      int vec_enc) {
5002   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5003   Label done;
5004   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5005   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5006   kortestwl(ktmp1, ktmp1);
5007   jccb(Assembler::equal, done);
5008 
5009   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5010   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5011   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5012 
5013   kxorwl(ktmp1, ktmp1, ktmp2);
5014   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5015   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5016   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5017   bind(done);
5018 }
5019 
5020 /*
5021  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5022  * If src is NaN, the result is 0.
5023  * If src is negative infinity or any value less than or equal to Long.MIN_VALUE,
5024  * the result is Long.MIN_VALUE.
5025  * If src is positive infinity or any value greater than or equal to Long.MAX_VALUE,
5026  * the result is Long.MAX_VALUE.
5027  */
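     // For example (Java cast semantics): (long)Double.NaN == 0L,
     // (long)Double.NEGATIVE_INFINITY == Long.MIN_VALUE and
     // (long)1.0e30 == Long.MAX_VALUE.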
5028 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5029                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5030                                                                       Register rscratch, AddressLiteral double_sign_flip,
5031                                                                       int vec_enc) {
5032   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5033 
5034   Label done;
5035   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5036   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5037   kortestwl(ktmp1, ktmp1);
5038   jccb(Assembler::equal, done);
5039 
5040   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5041   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5042   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5043 
5044   kxorwl(ktmp1, ktmp1, ktmp2);
5045   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5046   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5047   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5048   bind(done);
5049 }
5050 
5051 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5052                                                              XMMRegister xtmp, int index, int vec_enc) {
5053    assert(vec_enc < Assembler::AVX_512bit, "");
5054    if (vec_enc == Assembler::AVX_256bit) {
5055      vextractf128_high(xtmp, src);
5056      vshufps(dst, src, xtmp, index, vec_enc);
5057    } else {
5058      vshufps(dst, src, zero, index, vec_enc);
5059    }
5060 }
5061 
5062 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5063                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5064                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5065   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5066 
5067   Label done;
5068   // Compare the destination lanes with float_sign_flip
5069   // value to get mask for all special values.
5070   movdqu(xtmp1, float_sign_flip, rscratch);
5071   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5072   ptest(xtmp2, xtmp2);
5073   jccb(Assembler::equal, done);
5074 
5075   // Flip float_sign_flip to get max integer value.
5076   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5077   pxor(xtmp1, xtmp4);
5078 
5079   // Set destination lanes corresponding to unordered source lanes to zero.
5080   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5081   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5082 
5083   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5084   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5085   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5086 
5087   // Recompute the mask for remaining special value.
5088   pxor(xtmp2, xtmp3);
5089   // Extract mask corresponding to non-negative source lanes.
5090   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5091 
5092   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5093   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5094   pand(xtmp3, xtmp2);
5095 
5096   // Replace destination lanes holding the special value (0x80000000) with max int
5097   // if the corresponding source lane holds a +ve value.
5098   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5099   bind(done);
5100 }
5101 
5102 
5103 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5104                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5105   switch(to_elem_bt) {
5106     case T_SHORT:
5107       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5108       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5109       vpackusdw(dst, dst, zero, vec_enc);
5110       if (vec_enc == Assembler::AVX_256bit) {
5111         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5112       }
5113       break;
5114     case  T_BYTE:
5115       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5116       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5117       vpackusdw(dst, dst, zero, vec_enc);
5118       if (vec_enc == Assembler::AVX_256bit) {
5119         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5120       }
5121       vpackuswb(dst, dst, zero, vec_enc);
5122       break;
5123     default: assert(false, "%s", type2name(to_elem_bt));
5124   }
5125 }
5126 
5127 /*
5128  * Algorithm for vector D2L and F2I conversions:
5129  * a) Perform the vector D2L/F2I cast.
5130  * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
5131  *    That value signifies that the source lane could hold any of the special floating point
5132  *    values (NaN, -Inf, Inf, Max, -Min).
5133  * c) Set the destination lane to zero if the source lane is NaN.
5134  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5135  */
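     // The 0x80000000 check in step b) works because vcvttps2dq/vcvttpd2dq write the
     // 'integer indefinite' value 0x80000000 for NaN and out-of-range source lanes.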
5136 
5137 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5138                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5139                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5140   int to_elem_sz = type2aelembytes(to_elem_bt);
5141   assert(to_elem_sz <= 4, "");
5142   vcvttps2dq(dst, src, vec_enc);
5143   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5144   if (to_elem_sz < 4) {
5145     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5146     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5147   }
5148 }
5149 
5150 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5151                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5152                                             Register rscratch, int vec_enc) {
5153   int to_elem_sz = type2aelembytes(to_elem_bt);
5154   assert(to_elem_sz <= 4, "");
5155   vcvttps2dq(dst, src, vec_enc);
5156   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5157   switch(to_elem_bt) {
5158     case T_INT:
5159       break;
5160     case T_SHORT:
5161       evpmovdw(dst, dst, vec_enc);
5162       break;
5163     case T_BYTE:
5164       evpmovdb(dst, dst, vec_enc);
5165       break;
5166     default: assert(false, "%s", type2name(to_elem_bt));
5167   }
5168 }
5169 
5170 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5171                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5172                                             Register rscratch, int vec_enc) {
5173   evcvttps2qq(dst, src, vec_enc);
5174   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5175 }
5176 
5177 // Handling for downcasting from double to integer or sub-word types on AVX2.
5178 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5179                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5180                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5181   int to_elem_sz = type2aelembytes(to_elem_bt);
5182   assert(to_elem_sz < 8, "");
5183   vcvttpd2dq(dst, src, vec_enc);
5184   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5185                                               float_sign_flip, vec_enc);
5186   if (to_elem_sz < 4) {
5187     // xtmp4 holds all zero lanes.
5188     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5189   }
5190 }
5191 
5192 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5193                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5194                                             KRegister ktmp2, AddressLiteral sign_flip,
5195                                             Register rscratch, int vec_enc) {
5196   if (VM_Version::supports_avx512dq()) {
5197     evcvttpd2qq(dst, src, vec_enc);
5198     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5199     switch(to_elem_bt) {
5200       case T_LONG:
5201         break;
5202       case T_INT:
5203         evpmovsqd(dst, dst, vec_enc);
5204         break;
5205       case T_SHORT:
5206         evpmovsqd(dst, dst, vec_enc);
5207         evpmovdw(dst, dst, vec_enc);
5208         break;
5209       case T_BYTE:
5210         evpmovsqd(dst, dst, vec_enc);
5211         evpmovdb(dst, dst, vec_enc);
5212         break;
5213       default: assert(false, "%s", type2name(to_elem_bt));
5214     }
5215   } else {
5216     assert(type2aelembytes(to_elem_bt) <= 4, "");
5217     vcvttpd2dq(dst, src, vec_enc);
5218     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5219     switch(to_elem_bt) {
5220       case T_INT:
5221         break;
5222       case T_SHORT:
5223         evpmovdw(dst, dst, vec_enc);
5224         break;
5225       case T_BYTE:
5226         evpmovdb(dst, dst, vec_enc);
5227         break;
5228       default: assert(false, "%s", type2name(to_elem_bt));
5229     }
5230   }
5231 }
5232 
5233 #ifdef _LP64
5234 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5235                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5236                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5237   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round-towards-negative-infinity,
5238   // and restore the original MXCSR.RC mode afterwards.
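       // With round-towards-negative-infinity in effect, the add and convert below
       // compute floor(val + 0.5), e.g. 2.5 -> 3, -2.5 -> -2 and 2.4 -> 2.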
5239   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5240 
5241   mov64(tmp, julong_cast(0.5L));
5242   evpbroadcastq(xtmp1, tmp, vec_enc);
5243   vaddpd(xtmp1, src , xtmp1, vec_enc);
5244   evcvtpd2qq(dst, xtmp1, vec_enc);
5245   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5246                                                 double_sign_flip, vec_enc);
5247 
5248   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5249 }
5250 
5251 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5252                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5253                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5254   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round-towards-negative-infinity,
5255   // and restore the original MXCSR.RC mode afterwards.
5256   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5257 
5258   movl(tmp, jint_cast(0.5));
5259   movq(xtmp1, tmp);
5260   vbroadcastss(xtmp1, xtmp1, vec_enc);
5261   vaddps(xtmp1, src , xtmp1, vec_enc);
5262   vcvtps2dq(dst, xtmp1, vec_enc);
5263   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5264                                               float_sign_flip, vec_enc);
5265 
5266   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5267 }
5268 
5269 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5270                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5271                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5272   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round-towards-negative-infinity,
5273   // and restore the original MXCSR.RC mode afterwards.
5274   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5275 
5276   movl(tmp, jint_cast(0.5));
5277   movq(xtmp1, tmp);
5278   vbroadcastss(xtmp1, xtmp1, vec_enc);
5279   vaddps(xtmp1, src , xtmp1, vec_enc);
5280   vcvtps2dq(dst, xtmp1, vec_enc);
5281   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5282 
5283   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5284 }
5285 #endif // _LP64
5286 
5287 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5288                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5289   switch (from_elem_bt) {
5290     case T_BYTE:
5291       switch (to_elem_bt) {
5292         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5293         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5294         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5295         default: ShouldNotReachHere();
5296       }
5297       break;
5298     case T_SHORT:
5299       switch (to_elem_bt) {
5300         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5301         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5302         default: ShouldNotReachHere();
5303       }
5304       break;
5305     case T_INT:
5306       assert(to_elem_bt == T_LONG, "");
5307       vpmovzxdq(dst, src, vlen_enc);
5308       break;
5309     default:
5310       ShouldNotReachHere();
5311   }
5312 }
5313 
5314 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5315                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5316   switch (from_elem_bt) {
5317     case T_BYTE:
5318       switch (to_elem_bt) {
5319         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5320         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5321         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5322         default: ShouldNotReachHere();
5323       }
5324       break;
5325     case T_SHORT:
5326       switch (to_elem_bt) {
5327         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5328         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5329         default: ShouldNotReachHere();
5330       }
5331       break;
5332     case T_INT:
5333       assert(to_elem_bt == T_LONG, "");
5334       vpmovsxdq(dst, src, vlen_enc);
5335       break;
5336     default:
5337       ShouldNotReachHere();
5338   }
5339 }
5340 
5341 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5342                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5343   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5344   assert(vlen_enc != AVX_512bit, "");
5345 
5346   int dst_bt_size = type2aelembytes(dst_bt);
5347   int src_bt_size = type2aelembytes(src_bt);
5348   if (dst_bt_size > src_bt_size) {
5349     switch (dst_bt_size / src_bt_size) {
5350       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5351       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5352       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5353       default: ShouldNotReachHere();
5354     }
5355   } else {
5356     assert(dst_bt_size < src_bt_size, "");
5357     switch (src_bt_size / dst_bt_size) {
5358       case 2: {
5359         if (vlen_enc == AVX_128bit) {
5360           vpacksswb(dst, src, src, vlen_enc);
5361         } else {
5362           vpacksswb(dst, src, src, vlen_enc);
5363           vpermq(dst, dst, 0x08, vlen_enc);
5364         }
5365         break;
5366       }
5367       case 4: {
5368         if (vlen_enc == AVX_128bit) {
5369           vpackssdw(dst, src, src, vlen_enc);
5370           vpacksswb(dst, dst, dst, vlen_enc);
5371         } else {
5372           vpackssdw(dst, src, src, vlen_enc);
5373           vpermq(dst, dst, 0x08, vlen_enc);
5374           vpacksswb(dst, dst, dst, AVX_128bit);
5375         }
5376         break;
5377       }
5378       case 8: {
5379         if (vlen_enc == AVX_128bit) {
5380           vpshufd(dst, src, 0x08, vlen_enc);
5381           vpackssdw(dst, dst, dst, vlen_enc);
5382           vpacksswb(dst, dst, dst, vlen_enc);
5383         } else {
5384           vpshufd(dst, src, 0x08, vlen_enc);
5385           vpermq(dst, dst, 0x08, vlen_enc);
5386           vpackssdw(dst, dst, dst, AVX_128bit);
5387           vpacksswb(dst, dst, dst, AVX_128bit);
5388         }
5389         break;
5390       }
5391       default: ShouldNotReachHere();
5392     }
5393   }
5394 }
5395 
5396 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5397                                    bool merge, BasicType bt, int vlen_enc) {
5398   if (bt == T_INT) {
5399     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5400   } else {
5401     assert(bt == T_LONG, "");
5402     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5403   }
5404 }
5405 
5406 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5407                                    bool merge, BasicType bt, int vlen_enc) {
5408   if (bt == T_INT) {
5409     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5410   } else {
5411     assert(bt == T_LONG, "");
5412     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5413   }
5414 }
5415 
5416 #ifdef _LP64
5417 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5418                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5419                                                int vec_enc) {
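  // Deposit the low 8 bits of the scalar mask into the least significant bit of each
  // byte of rtmp1 (pdep with mask 0x0101010101010101), so each mask bit becomes a byte
  // lane holding 0 or 1; the loop below repeats this for each further group of 8 bits.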
5420   int index = 0;
5421   int vindex = 0;
5422   mov64(rtmp1, 0x0101010101010101L);
5423   pdepq(rtmp1, src, rtmp1);
5424   if (mask_len > 8) {
5425     movq(rtmp2, src);
5426     vpxor(xtmp, xtmp, xtmp, vec_enc);
5427     movq(xtmp, rtmp1);
5428   }
5429   movq(dst, rtmp1);
5430 
5431   mask_len -= 8;
5432   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5434     index++;
5435     if ((index % 2) == 0) {
5436       pxor(xtmp, xtmp);
5437     }
5438     mov64(rtmp1, 0x0101010101010101L);
5439     shrq(rtmp2, 8);
5440     pdepq(rtmp1, rtmp2, rtmp1);
5441     pinsrq(xtmp, rtmp1, index % 2);
5442     vindex = index / 2;
5443     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes have been updated, to save redundant instructions.
5446       if (index % 2) {
5447         vinsertf128(dst, dst, xtmp, vindex);
5448       }
5449     } else {
5450       vmovdqu(dst, xtmp);
5451     }
5452     mask_len -= 8;
5453   }
5454 }
5455 
5456 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5457   switch(opc) {
5458     case Op_VectorMaskTrueCount:
5459       popcntq(dst, tmp);
5460       break;
5461     case Op_VectorMaskLastTrue:
5462       if (VM_Version::supports_lzcnt()) {
5463         lzcntq(tmp, tmp);
5464         movl(dst, 63);
5465         subl(dst, tmp);
5466       } else {
5467         movl(dst, -1);
5468         bsrq(tmp, tmp);
5469         cmov32(Assembler::notZero, dst, tmp);
5470       }
5471       break;
5472     case Op_VectorMaskFirstTrue:
5473       if (VM_Version::supports_bmi1()) {
5474         if (masklen < 32) {
5475           orl(tmp, 1 << masklen);
5476           tzcntl(dst, tmp);
5477         } else if (masklen == 32) {
5478           tzcntl(dst, tmp);
5479         } else {
5480           assert(masklen == 64, "");
5481           tzcntq(dst, tmp);
5482         }
5483       } else {
5484         if (masklen < 32) {
5485           orl(tmp, 1 << masklen);
5486           bsfl(dst, tmp);
5487         } else {
5488           assert(masklen == 32 || masklen == 64, "");
5489           movl(dst, masklen);
5490           if (masklen == 32)  {
5491             bsfl(tmp, tmp);
5492           } else {
5493             bsfq(tmp, tmp);
5494           }
5495           cmov32(Assembler::notZero, dst, tmp);
5496         }
5497       }
5498       break;
5499     case Op_VectorMaskToLong:
5500       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5501       break;
5502     default: assert(false, "Unhandled mask operation");
5503   }
5504 }
5505 
5506 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5507                                               int masklen, int masksize, int vec_enc) {
5508   assert(VM_Version::supports_popcnt(), "");
5509 
  if (VM_Version::supports_avx512bw()) {
5511     kmovql(tmp, mask);
5512   } else {
5513     assert(masklen <= 16, "");
5514     kmovwl(tmp, mask);
5515   }
5516 
  // Masks generated out of partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
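  // (e.g. for masklen == 8 the scalar mask is ANDed with 0xFF below, discarding any
  //  stale upper bits brought in by the kmov above.)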
5519   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5520     andq(tmp, (1 << masklen) - 1);
5521   }
5522 
5523   vector_mask_operation_helper(opc, dst, tmp, masklen);
5524 }
5525 
5526 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5527                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5528   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5529          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5530   assert(VM_Version::supports_popcnt(), "");
5531 
5532   bool need_clip = false;
5533   switch(bt) {
5534     case T_BOOLEAN:
      // While masks of other types contain 0/-1 lane values, boolean masks contain lane values of 0/1.
5536       vpxor(xtmp, xtmp, xtmp, vec_enc);
5537       vpsubb(xtmp, xtmp, mask, vec_enc);
5538       vpmovmskb(tmp, xtmp, vec_enc);
5539       need_clip = masklen < 16;
5540       break;
5541     case T_BYTE:
5542       vpmovmskb(tmp, mask, vec_enc);
5543       need_clip = masklen < 16;
5544       break;
5545     case T_SHORT:
5546       vpacksswb(xtmp, mask, mask, vec_enc);
5547       if (masklen >= 16) {
5548         vpermpd(xtmp, xtmp, 8, vec_enc);
5549       }
5550       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5551       need_clip = masklen < 16;
5552       break;
5553     case T_INT:
5554     case T_FLOAT:
5555       vmovmskps(tmp, mask, vec_enc);
5556       need_clip = masklen < 4;
5557       break;
5558     case T_LONG:
5559     case T_DOUBLE:
5560       vmovmskpd(tmp, mask, vec_enc);
5561       need_clip = masklen < 2;
5562       break;
5563     default: assert(false, "Unhandled type, %s", type2name(bt));
5564   }
5565 
  // Masks generated out of partial vector comparison/replicate/mask manipulation
  // operations need to be clipped.
5568   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5569     // need_clip implies masklen < 32
5570     andq(tmp, (1 << masklen) - 1);
5571   }
5572 
5573   vector_mask_operation_helper(opc, dst, tmp, masklen);
5574 }
5575 
5576 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5577                                              Register rtmp2, int mask_len) {
5578   kmov(rtmp1, src);
5579   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5580   mov64(rtmp2, -1L);
5581   pextq(rtmp2, rtmp2, rtmp1);
5582   kmov(dst, rtmp2);
5583 }
5584 
5585 #ifdef _LP64
5586 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5587                                                     XMMRegister mask, Register rtmp, Register rscratch,
5588                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5589                                                     int vec_enc) {
5590   assert(type2aelembytes(bt) >= 4, "");
5591   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5592   address compress_perm_table = nullptr;
5593   address expand_perm_table = nullptr;
5594   if (type2aelembytes(bt) == 8) {
5595     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5596     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5597     vmovmskpd(rtmp, mask, vec_enc);
5598   } else {
5599     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5600     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5601     vmovmskps(rtmp, mask, vec_enc);
5602   }
5603   shlq(rtmp, 5); // for 32 byte permute row.
5604   if (opcode == Op_CompressV) {
5605     lea(rscratch, ExternalAddress(compress_perm_table));
5606   } else {
5607     lea(rscratch, ExternalAddress(expand_perm_table));
5608   }
5609   addptr(rtmp, rscratch);
5610   vmovdqu(permv, Address(rtmp));
5611   vpermps(dst, permv, src, Assembler::AVX_256bit);
5612   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the row can also serve as a blending mask after
  // compressing/expanding the source vector lanes.
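  // For example, with 32 bit lanes and a compress mask of 0b0101, the selected
  // permute row would read [0, 2, -1, -1, ...]: lanes 0 and 2 are moved to the
  // front by the permute above and the remaining lanes are zeroed by the blend
  // below. (Illustrative only; the exact row contents are defined by the stub tables.)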
5617   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5618 }
5619 #endif
5620 
5621 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5622                                                bool merge, BasicType bt, int vec_enc) {
5623   if (opcode == Op_CompressV) {
5624     switch(bt) {
5625     case T_BYTE:
5626       evpcompressb(dst, mask, src, merge, vec_enc);
5627       break;
5628     case T_CHAR:
5629     case T_SHORT:
5630       evpcompressw(dst, mask, src, merge, vec_enc);
5631       break;
5632     case T_INT:
5633       evpcompressd(dst, mask, src, merge, vec_enc);
5634       break;
5635     case T_FLOAT:
5636       evcompressps(dst, mask, src, merge, vec_enc);
5637       break;
5638     case T_LONG:
5639       evpcompressq(dst, mask, src, merge, vec_enc);
5640       break;
5641     case T_DOUBLE:
5642       evcompresspd(dst, mask, src, merge, vec_enc);
5643       break;
5644     default:
5645       fatal("Unsupported type %s", type2name(bt));
5646       break;
5647     }
5648   } else {
5649     assert(opcode == Op_ExpandV, "");
5650     switch(bt) {
5651     case T_BYTE:
5652       evpexpandb(dst, mask, src, merge, vec_enc);
5653       break;
5654     case T_CHAR:
5655     case T_SHORT:
5656       evpexpandw(dst, mask, src, merge, vec_enc);
5657       break;
5658     case T_INT:
5659       evpexpandd(dst, mask, src, merge, vec_enc);
5660       break;
5661     case T_FLOAT:
5662       evexpandps(dst, mask, src, merge, vec_enc);
5663       break;
5664     case T_LONG:
5665       evpexpandq(dst, mask, src, merge, vec_enc);
5666       break;
5667     case T_DOUBLE:
5668       evexpandpd(dst, mask, src, merge, vec_enc);
5669       break;
5670     default:
5671       fatal("Unsupported type %s", type2name(bt));
5672       break;
5673     }
5674   }
5675 }
5676 #endif
5677 
5678 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5679                                            KRegister ktmp1, int vec_enc) {
5680   if (opcode == Op_SignumVD) {
5681     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5683     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5684     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5686     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5687     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5688   } else {
5689     assert(opcode == Op_SignumVF, "");
5690     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5692     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5693     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5695     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5696     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5697   }
5698 }
5699 
5700 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5701                                           XMMRegister xtmp1, int vec_enc) {
5702   if (opcode == Op_SignumVD) {
5703     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5705     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5707     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5708     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5709   } else {
5710     assert(opcode == Op_SignumVF, "");
5711     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5713     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5715     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5716     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5717   }
5718 }
5719 
5720 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5721   if (VM_Version::supports_avx512bw()) {
5722     if (mask_len > 32) {
5723       kmovql(dst, src);
5724     } else {
5725       kmovdl(dst, src);
5726       if (mask_len != 32) {
5727         kshiftrdl(dst, dst, 32 - mask_len);
5728       }
5729     }
5730   } else {
5731     assert(mask_len <= 16, "");
5732     kmovwl(dst, src);
5733     if (mask_len != 16) {
5734       kshiftrwl(dst, dst, 16 - mask_len);
5735     }
5736   }
5737 }
5738 
5739 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5740   int lane_size = type2aelembytes(bt);
5741   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5742   if ((is_LP64 || lane_size < 8) &&
5743       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5744        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5745     movptr(rtmp, imm32);
5746     switch(lane_size) {
5747       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5748       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5749       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5750       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5753     }
5754   } else {
5755     movptr(rtmp, imm32);
5756     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5757     switch(lane_size) {
5758       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5759       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5760       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5761       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5764     }
5765   }
5766 }
5767 
5768 //
// The following is the lookup table based popcount computation algorithm:
5770 //       Index   Bit set count
5771 //     [ 0000 ->   0,
5772 //       0001 ->   1,
5773 //       0010 ->   1,
5774 //       0011 ->   2,
5775 //       0100 ->   1,
5776 //       0101 ->   2,
5777 //       0110 ->   2,
5778 //       0111 ->   3,
5779 //       1000 ->   1,
5780 //       1001 ->   2,
5781 //       1010 ->   3,
5782 //       1011 ->   3,
5783 //       1100 ->   2,
5784 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5786 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5787 //     shuffle indices for lookup table access.
5788 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5790 //     shuffle indices for lookup table access.
5791 //  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and, using a sum of absolute differences
//     against zero, add up the per-byte bitset counts within each quadword.
5794 //  f. Perform step e. for upper 128bit vector lane.
5795 //  g. Pack the bitset count of quadwords back to double word.
5796 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
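//
//  For example, for a single byte 0xB5 (0b10110101): lower nibble 0101 -> 2 (step a.),
//  upper nibble 1011 -> 3 (step c.), so the per-byte popcount after step d. is 5.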
5797 
5798 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5799                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5800   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5801   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5802   vpsrlw(dst, src, 4, vec_enc);
5803   vpand(dst, dst, xtmp1, vec_enc);
5804   vpand(xtmp1, src, xtmp1, vec_enc);
5805   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5806   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5807   vpshufb(dst, xtmp2, dst, vec_enc);
5808   vpaddb(dst, dst, xtmp1, vec_enc);
5809 }
5810 
5811 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5812                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5813   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5814   // Following code is as per steps e,f,g and h of above algorithm.
5815   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5816   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5817   vpsadbw(dst, dst, xtmp2, vec_enc);
5818   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5819   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5820   vpackuswb(dst, xtmp1, dst, vec_enc);
5821 }
5822 
5823 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5824                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5825   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5826   // Add the popcount of upper and lower bytes of word.
5827   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5828   vpsrlw(dst, xtmp1, 8, vec_enc);
5829   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5830   vpaddw(dst, dst, xtmp1, vec_enc);
5831 }
5832 
5833 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5834                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5835   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5836   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5837   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5838 }
5839 
5840 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5841                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5842   switch(bt) {
5843     case T_LONG:
5844       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5845       break;
5846     case T_INT:
5847       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5848       break;
5849     case T_CHAR:
5850     case T_SHORT:
5851       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5852       break;
5853     case T_BYTE:
5854     case T_BOOLEAN:
5855       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5856       break;
5857     default:
5858       fatal("Unsupported type %s", type2name(bt));
5859       break;
5860   }
5861 }
5862 
5863 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5864                                                       KRegister mask, bool merge, int vec_enc) {
5865   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5866   switch(bt) {
5867     case T_LONG:
5868       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5869       evpopcntq(dst, mask, src, merge, vec_enc);
5870       break;
5871     case T_INT:
5872       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5873       evpopcntd(dst, mask, src, merge, vec_enc);
5874       break;
5875     case T_CHAR:
5876     case T_SHORT:
5877       assert(VM_Version::supports_avx512_bitalg(), "");
5878       evpopcntw(dst, mask, src, merge, vec_enc);
5879       break;
5880     case T_BYTE:
5881     case T_BOOLEAN:
5882       assert(VM_Version::supports_avx512_bitalg(), "");
5883       evpopcntb(dst, mask, src, merge, vec_enc);
5884       break;
5885     default:
5886       fatal("Unsupported type %s", type2name(bt));
5887       break;
5888   }
5889 }
5890 
5891 #ifndef _LP64
5892 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5893   assert(VM_Version::supports_avx512bw(), "");
5894   kmovdl(tmp, src);
5895   kunpckdql(dst, tmp, tmp);
5896 }
5897 #endif
5898 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// of a 4 bit value; the reversed bit sequence of a byte is then obtained by
// swapping the reversed bit sequences of its upper and lower nibbles.
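// For example, reversing the byte 0x2D (0b00101101): the lower nibble 1101 reverses
// to 1011 and the upper nibble 0010 reverses to 0100, so the reversed byte is
// (1011 << 4) | 0100 = 0b10110100 = 0xB4.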
5905 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5906                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5907   if (VM_Version::supports_avx512vlbw()) {
5908 
5909     // Get the reverse bit sequence of lower nibble of each byte.
5910     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5911     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5912     evpandq(dst, xtmp2, src, vec_enc);
5913     vpshufb(dst, xtmp1, dst, vec_enc);
5914     vpsllq(dst, dst, 4, vec_enc);
5915 
5916     // Get the reverse bit sequence of upper nibble of each byte.
5917     vpandn(xtmp2, xtmp2, src, vec_enc);
5918     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5919     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5920 
5921     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5922     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5923     evporq(xtmp2, dst, xtmp2, vec_enc);
5924     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5925 
  } else if (vec_enc == Assembler::AVX_512bit) {
5927     // Shift based bit reversal.
5928     assert(bt == T_LONG || bt == T_INT, "");
5929 
5930     // Swap lower and upper nibble of each byte.
5931     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5932 
5933     // Swap two least and most significant bits of each nibble.
5934     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5935 
5936     // Swap adjacent pair of bits.
5937     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5938     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5939 
5940     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5941     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5942   } else {
5943     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5944     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5945 
5946     // Get the reverse bit sequence of lower nibble of each byte.
5947     vpand(dst, xtmp2, src, vec_enc);
5948     vpshufb(dst, xtmp1, dst, vec_enc);
5949     vpsllq(dst, dst, 4, vec_enc);
5950 
5951     // Get the reverse bit sequence of upper nibble of each byte.
5952     vpandn(xtmp2, xtmp2, src, vec_enc);
5953     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5954     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5955 
5956     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5957     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5958     vpor(xtmp2, dst, xtmp2, vec_enc);
5959     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5960   }
5961 }
5962 
5963 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5964                                                 XMMRegister xtmp, Register rscratch) {
5965   assert(VM_Version::supports_gfni(), "");
5966   assert(rscratch != noreg || always_reachable(mask), "missing");
5967 
5968   // Galois field instruction based bit reversal based on following algorithm.
5969   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5970   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5971   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5972   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5973 }
5974 
5975 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5976                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5977   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5978   evpandq(dst, xtmp1, src, vec_enc);
5979   vpsllq(dst, dst, nbits, vec_enc);
5980   vpandn(xtmp1, xtmp1, src, vec_enc);
5981   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5982   evporq(dst, dst, xtmp1, vec_enc);
5983 }
5984 
5985 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5986                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5987   // Shift based bit reversal.
5988   assert(VM_Version::supports_evex(), "");
5989   switch(bt) {
5990     case T_LONG:
5991       // Swap upper and lower double word of each quad word.
5992       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5993       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5994       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5995       break;
5996     case T_INT:
5997       // Swap upper and lower word of each double word.
5998       evprord(xtmp1, k0, src, 16, true, vec_enc);
5999       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6000       break;
6001     case T_CHAR:
6002     case T_SHORT:
6003       // Swap upper and lower byte of each word.
6004       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6005       break;
6006     case T_BYTE:
6007       evmovdquq(dst, k0, src, true, vec_enc);
6008       break;
6009     default:
6010       fatal("Unsupported type %s", type2name(bt));
6011       break;
6012   }
6013 }
6014 
6015 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6016   if (bt == T_BYTE) {
6017     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6018       evmovdquq(dst, k0, src, true, vec_enc);
6019     } else {
6020       vmovdqu(dst, src);
6021     }
6022     return;
6023   }
6024   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6025   // pre-computed shuffle indices.
6026   switch(bt) {
6027     case T_LONG:
6028       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6029       break;
6030     case T_INT:
6031       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6032       break;
6033     case T_CHAR:
6034     case T_SHORT:
6035       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6036       break;
6037     default:
6038       fatal("Unsupported type %s", type2name(bt));
6039       break;
6040   }
6041   vpshufb(dst, src, dst, vec_enc);
6042 }
6043 
6044 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6045                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6046                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6047   assert(is_integral_type(bt), "");
6048   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6049   assert(VM_Version::supports_avx512cd(), "");
6050   switch(bt) {
6051     case T_LONG:
6052       evplzcntq(dst, ktmp, src, merge, vec_enc);
6053       break;
6054     case T_INT:
6055       evplzcntd(dst, ktmp, src, merge, vec_enc);
6056       break;
6057     case T_SHORT:
6058       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6059       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6060       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6061       vpunpckhwd(dst, xtmp1, src, vec_enc);
6062       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6063       vpackusdw(dst, xtmp2, dst, vec_enc);
6064       break;
6065     case T_BYTE:
6066       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6067       // accessing the lookup table.
6068       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6069       // accessing the lookup table.
6070       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6071       assert(VM_Version::supports_avx512bw(), "");
6072       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6073       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6074       vpand(xtmp2, dst, src, vec_enc);
6075       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6076       vpsrlw(xtmp3, src, 4, vec_enc);
6077       vpand(xtmp3, dst, xtmp3, vec_enc);
6078       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6079       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6080       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6081       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6082       break;
6083     default:
6084       fatal("Unsupported type %s", type2name(bt));
6085       break;
6086   }
6087 }
6088 
6089 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6090                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6091   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6092   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6093   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6094   // accessing the lookup table.
6095   vpand(dst, xtmp2, src, vec_enc);
6096   vpshufb(dst, xtmp1, dst, vec_enc);
6097   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6098   // accessing the lookup table.
6099   vpsrlw(xtmp3, src, 4, vec_enc);
6100   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6101   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6102   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6103   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6104   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6105   vpaddb(dst, dst, xtmp2, vec_enc);
6106   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6107 }
6108 
6109 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6110                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6111   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6112   // Add zero counts of lower byte and upper byte of a word if
6113   // upper byte holds a zero value.
6114   vpsrlw(xtmp3, src, 8, vec_enc);
6115   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6116   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6117   vpsllw(xtmp2, dst, 8, vec_enc);
6118   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6119   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6120   vpsrlw(dst, dst, 8, vec_enc);
6121 }
6122 
6123 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6124                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  //   LZCNT = 32 - ((biased_exp - 127) + 1)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.
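  // For example, a lane value of 0x00001000 converts to 4096.0f with biased
  // exponent 139, giving LZCNT = 32 - ((139 - 127) + 1) = 19.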
6130 
6131   // Broadcast 0xFF
6132   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6133   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6134 
6135   // Extract biased exponent.
6136   vcvtdq2ps(dst, src, vec_enc);
6137   vpsrld(dst, dst, 23, vec_enc);
6138   vpand(dst, dst, xtmp1, vec_enc);
6139 
6140   // Broadcast 127.
6141   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6142   // Exponent = biased_exp - 127
6143   vpsubd(dst, dst, xtmp1, vec_enc);
6144 
6145   // Exponent = Exponent  + 1
6146   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6147   vpaddd(dst, dst, xtmp3, vec_enc);
6148 
6149   // Replace -ve exponent with zero, exponent is -ve when src
6150   // lane contains a zero value.
6151   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6152   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6153 
6154   // Rematerialize broadcast 32.
6155   vpslld(xtmp1, xtmp3, 5, vec_enc);
6156   // Exponent is 32 if corresponding source lane contains max_int value.
6157   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6158   // LZCNT = 32 - exponent
6159   vpsubd(dst, xtmp1, dst, vec_enc);
6160 
  // Replace LZCNT with the value 1 if the corresponding source lane
  // contains the max_int value.
6163   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6164 
  // Replace the computed count with 0 if the source lane value is less than zero.
6166   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6167   vblendvps(dst, dst, xtmp2, src, vec_enc);
6168 }
6169 
6170 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6171                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6172   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6173   // Add zero counts of lower word and upper word of a double word if
6174   // upper word holds a zero value.
6175   vpsrld(xtmp3, src, 16, vec_enc);
6176   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6177   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6178   vpslld(xtmp2, dst, 16, vec_enc);
6179   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6180   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6181   vpsrld(dst, dst, 16, vec_enc);
6182   // Add zero counts of lower doubleword and upper doubleword of a
6183   // quadword if upper doubleword holds a zero value.
6184   vpsrlq(xtmp3, src, 32, vec_enc);
6185   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6186   vpsllq(xtmp2, dst, 32, vec_enc);
6187   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6188   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6189   vpsrlq(dst, dst, 32, vec_enc);
6190 }
6191 
6192 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6193                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6194                                                        Register rtmp, int vec_enc) {
6195   assert(is_integral_type(bt), "unexpected type");
6196   assert(vec_enc < Assembler::AVX_512bit, "");
6197   switch(bt) {
6198     case T_LONG:
6199       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6200       break;
6201     case T_INT:
6202       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6203       break;
6204     case T_SHORT:
6205       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6206       break;
6207     case T_BYTE:
6208       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6209       break;
6210     default:
6211       fatal("Unsupported type %s", type2name(bt));
6212       break;
6213   }
6214 }
6215 
6216 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6217   switch(bt) {
6218     case T_BYTE:
6219       vpsubb(dst, src1, src2, vec_enc);
6220       break;
6221     case T_SHORT:
6222       vpsubw(dst, src1, src2, vec_enc);
6223       break;
6224     case T_INT:
6225       vpsubd(dst, src1, src2, vec_enc);
6226       break;
6227     case T_LONG:
6228       vpsubq(dst, src1, src2, vec_enc);
6229       break;
6230     default:
6231       fatal("Unsupported type %s", type2name(bt));
6232       break;
6233   }
6234 }
6235 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
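// For example, for a 32 bit lane x = 0x00000008: (x - 1) & ~x = 0x00000007,
// CLZ(0x00000007) = 29, so CTZ = 32 - 29 = 3.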
6240 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6241                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6242                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6243   assert(is_integral_type(bt), "");
6244   // xtmp = -1
6245   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6246   // xtmp = xtmp + src
6247   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6248   // xtmp = xtmp & ~src
6249   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6250   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6251   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6252   vpsub(bt, dst, xtmp4, dst, vec_enc);
6253 }
6254 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per
// the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
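// For example, for a 32 bit lane x = 0x00000008: x | -x = 0xFFFFFFF8,
// POPC(0xFFFFFFF8) = 29, so CTZ = 32 - 29 = 3.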
6257 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6258                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6259   assert(is_integral_type(bt), "");
6260   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6262   // xtmp = 0 - src
6263   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6264   // xtmp = xtmp | src
6265   vpor(xtmp3, xtmp3, src, vec_enc);
6266   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6267   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6268   vpsub(bt, dst, xtmp1, dst, vec_enc);
6269 }
6270 
6271 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6272   Label done;
6273   Label neg_divisor_fastpath;
6274   cmpl(divisor, 0);
6275   jccb(Assembler::less, neg_divisor_fastpath);
6276   xorl(rdx, rdx);
6277   divl(divisor);
6278   jmpb(done);
6279   bind(neg_divisor_fastpath);
6280   // Fastpath for divisor < 0:
6281   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6282   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
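  // When the divisor has its sign bit set, its unsigned value is at least 2^31, so the
  // unsigned quotient can only be 0 or 1; it is 1 exactly when the dividend is
  // unsigned-greater-or-equal to the divisor, which the expression above computes via
  // the sign bit of dividend & ~(dividend - divisor).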
6283   movl(rdx, rax);
6284   subl(rdx, divisor);
6285   if (VM_Version::supports_bmi1()) {
6286     andnl(rax, rdx, rax);
6287   } else {
6288     notl(rdx);
6289     andl(rax, rdx);
6290   }
6291   shrl(rax, 31);
6292   bind(done);
6293 }
6294 
6295 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6296   Label done;
6297   Label neg_divisor_fastpath;
6298   cmpl(divisor, 0);
6299   jccb(Assembler::less, neg_divisor_fastpath);
6300   xorl(rdx, rdx);
6301   divl(divisor);
6302   jmpb(done);
6303   bind(neg_divisor_fastpath);
6304   // Fastpath when divisor < 0:
6305   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6306   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
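  // The arithmetic shift of dividend & ~(dividend - divisor) yields 0 or all ones
  // (the unsigned quotient is 0 or 1 here), which then masks the divisor: the
  // remainder is dividend minus divisor when the quotient is 1, and dividend otherwise.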
6307   movl(rdx, rax);
6308   subl(rax, divisor);
6309   if (VM_Version::supports_bmi1()) {
6310     andnl(rax, rax, rdx);
6311   } else {
6312     notl(rax);
6313     andl(rax, rdx);
6314   }
6315   sarl(rax, 31);
6316   andl(rax, divisor);
6317   subl(rdx, rax);
6318   bind(done);
6319 }
6320 
6321 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6322   Label done;
6323   Label neg_divisor_fastpath;
6324 
6325   cmpl(divisor, 0);
6326   jccb(Assembler::less, neg_divisor_fastpath);
6327   xorl(rdx, rdx);
6328   divl(divisor);
6329   jmpb(done);
6330   bind(neg_divisor_fastpath);
6331   // Fastpath for divisor < 0:
6332   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6333   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6334   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6335   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6336   movl(rdx, rax);
6337   subl(rax, divisor);
6338   if (VM_Version::supports_bmi1()) {
6339     andnl(rax, rax, rdx);
6340   } else {
6341     notl(rax);
6342     andl(rax, rdx);
6343   }
6344   movl(tmp, rax);
6345   shrl(rax, 31); // quotient
6346   sarl(tmp, 31);
6347   andl(tmp, divisor);
6348   subl(rdx, tmp); // remainder
6349   bind(done);
6350 }
6351 
6352 #ifdef _LP64
6353 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6354                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
6356     // Galois field instruction based bit reversal based on following algorithm.
6357     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
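    // The affine transform against the constant above reverses the bits within each
    // byte; the bswapl at the end then reverses the byte order to complete the reversal.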
6358     mov64(rtmp, 0x8040201008040201L);
6359     movq(xtmp1, src);
6360     movq(xtmp2, rtmp);
6361     gf2p8affineqb(xtmp1, xtmp2, 0);
6362     movq(dst, xtmp1);
6363   } else {
6364     // Swap even and odd numbered bits.
6365     movl(rtmp, src);
6366     andl(rtmp, 0x55555555);
6367     shll(rtmp, 1);
6368     movl(dst, src);
6369     andl(dst, 0xAAAAAAAA);
6370     shrl(dst, 1);
6371     orl(dst, rtmp);
6372 
6373     // Swap LSB and MSB 2 bits of each nibble.
6374     movl(rtmp, dst);
6375     andl(rtmp, 0x33333333);
6376     shll(rtmp, 2);
6377     andl(dst, 0xCCCCCCCC);
6378     shrl(dst, 2);
6379     orl(dst, rtmp);
6380 
6381     // Swap LSB and MSB 4 bits of each byte.
6382     movl(rtmp, dst);
6383     andl(rtmp, 0x0F0F0F0F);
6384     shll(rtmp, 4);
6385     andl(dst, 0xF0F0F0F0);
6386     shrl(dst, 4);
6387     orl(dst, rtmp);
6388   }
6389   bswapl(dst);
6390 }
6391 
6392 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6393                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
6395     // Galois field instruction based bit reversal based on following algorithm.
6396     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6397     mov64(rtmp1, 0x8040201008040201L);
6398     movq(xtmp1, src);
6399     movq(xtmp2, rtmp1);
6400     gf2p8affineqb(xtmp1, xtmp2, 0);
6401     movq(dst, xtmp1);
6402   } else {
6403     // Swap even and odd numbered bits.
6404     movq(rtmp1, src);
6405     mov64(rtmp2, 0x5555555555555555L);
6406     andq(rtmp1, rtmp2);
6407     shlq(rtmp1, 1);
6408     movq(dst, src);
6409     notq(rtmp2);
6410     andq(dst, rtmp2);
6411     shrq(dst, 1);
6412     orq(dst, rtmp1);
6413 
6414     // Swap LSB and MSB 2 bits of each nibble.
6415     movq(rtmp1, dst);
6416     mov64(rtmp2, 0x3333333333333333L);
6417     andq(rtmp1, rtmp2);
6418     shlq(rtmp1, 2);
6419     notq(rtmp2);
6420     andq(dst, rtmp2);
6421     shrq(dst, 2);
6422     orq(dst, rtmp1);
6423 
6424     // Swap LSB and MSB 4 bits of each byte.
6425     movq(rtmp1, dst);
6426     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6427     andq(rtmp1, rtmp2);
6428     shlq(rtmp1, 4);
6429     notq(rtmp2);
6430     andq(dst, rtmp2);
6431     shrq(dst, 4);
6432     orq(dst, rtmp1);
6433   }
6434   bswapq(dst);
6435 }
6436 
6437 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6438   Label done;
6439   Label neg_divisor_fastpath;
6440   cmpq(divisor, 0);
6441   jccb(Assembler::less, neg_divisor_fastpath);
6442   xorl(rdx, rdx);
6443   divq(divisor);
6444   jmpb(done);
6445   bind(neg_divisor_fastpath);
6446   // Fastpath for divisor < 0:
6447   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6448   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6449   movq(rdx, rax);
6450   subq(rdx, divisor);
6451   if (VM_Version::supports_bmi1()) {
6452     andnq(rax, rdx, rax);
6453   } else {
6454     notq(rdx);
6455     andq(rax, rdx);
6456   }
6457   shrq(rax, 63);
6458   bind(done);
6459 }
6460 
6461 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6462   Label done;
6463   Label neg_divisor_fastpath;
6464   cmpq(divisor, 0);
6465   jccb(Assembler::less, neg_divisor_fastpath);
6466   xorq(rdx, rdx);
6467   divq(divisor);
6468   jmp(done);
6469   bind(neg_divisor_fastpath);
6470   // Fastpath when divisor < 0:
6471   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6472   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6473   movq(rdx, rax);
6474   subq(rax, divisor);
6475   if (VM_Version::supports_bmi1()) {
6476     andnq(rax, rax, rdx);
6477   } else {
6478     notq(rax);
6479     andq(rax, rdx);
6480   }
6481   sarq(rax, 63);
6482   andq(rax, divisor);
6483   subq(rdx, rax);
6484   bind(done);
6485 }
6486 
6487 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6488   Label done;
6489   Label neg_divisor_fastpath;
6490   cmpq(divisor, 0);
6491   jccb(Assembler::less, neg_divisor_fastpath);
6492   xorq(rdx, rdx);
6493   divq(divisor);
6494   jmp(done);
6495   bind(neg_divisor_fastpath);
6496   // Fastpath for divisor < 0:
6497   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6498   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6499   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6500   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6501   movq(rdx, rax);
6502   subq(rax, divisor);
6503   if (VM_Version::supports_bmi1()) {
6504     andnq(rax, rax, rdx);
6505   } else {
6506     notq(rax);
6507     andq(rax, rdx);
6508   }
6509   movq(tmp, rax);
6510   shrq(rax, 63); // quotient
6511   sarq(tmp, 63);
6512   andq(tmp, divisor);
6513   subq(rdx, tmp); // remainder
6514   bind(done);
6515 }
6516 #endif
6517 
6518 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6519                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6520                                         int vlen_enc) {
6521   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices which are
  // equal modulo 16 select the same relative position within a 128 bit
  // lane, i.e. shuffle indices 16, 32 and 48 all select the first element
  // of their respective 128 bit lanes.
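  // For example, a shuffle index of 35 normalizes to 35 & 0xF == 3 and is satisfied
  // by the pass below that broadcasts the third 128 bit lane (indices 32-47),
  // selecting byte 3 of that lane, i.e. byte 35 of the original source vector.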
6528   movl(rtmp, 16);
6529   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6530 
6531   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6532   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6533   // original shuffle indices and move the shuffled lanes corresponding to true
6534   // mask to destination vector.
6535   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6536   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6537   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6538 
6539   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6540   // and broadcasting second 128 bit lane.
6541   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6542   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6543   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6544   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6545   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6546 
6547   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6548   // and broadcasting third 128 bit lane.
6549   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6550   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6551   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6552   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6553   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6554 
6555   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6557   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6558   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6559   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6560   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6561   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6562 }
6563 
6564 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6565                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6566   if (vlen_enc == AVX_128bit) {
6567     vpermilps(dst, src, shuffle, vlen_enc);
6568   } else if (bt == T_INT) {
6569     vpermd(dst, shuffle, src, vlen_enc);
6570   } else {
6571     assert(bt == T_FLOAT, "");
6572     vpermps(dst, shuffle, src, vlen_enc);
6573   }
6574 }
6575 
6576 #ifdef _LP64
6577 void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
6578   // Note: Don't clobber obj anywhere in that method!
6579 
6580   // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
6581   // obj-start, so that we can load from the object's mark-word instead. Usually the address
6582   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
6583   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
6584   // then passes that register as obj and 0 in disp. The following code extracts the base
6585   // and offset to load the mark-word.
6586   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
6587   movq(dst, Address(obj, index, scale, offset));
6588   shrq(dst, markWord::klass_shift);
6589 }
6590 #endif