1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang the stack for them.  But be careful, because
  70   // some VM calls (such as call site linkage) can use several kilobytes of
  71   // stack.  But the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
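    // Note: the bang probes the stack below rsp for the frame (and shadow) size so that a
    // potential overflow is detected here, at method entry, rather than at some arbitrary
    // point inside the method.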
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that, on return to the interpreter, rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
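    // Forcing the 32-bit immediate keeps this first instruction >= 5 bytes, which
    // NativeJump::patch_verified_entry relies on (see the warning above).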
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 137  #ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
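      // Roughly: the barrier compares a guard value associated with this nmethod against
      // one maintained by the runtime; if the nmethod has been armed (e.g. by the GC for
      // concurrent unloading), the slow path calls into the runtime to fix the nmethod up
      // before its code is allowed to run.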
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 #if INCLUDE_RTM_OPT
 176 
 177 // Update rtm_counters based on abort status
 178 // input: abort_status
 179 //        rtm_counters (RTMLockingCounters*)
 180 // flags are killed
 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 182 
 183   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 184   if (PrintPreciseRTMLockingStatistics) {
 185     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 186       Label check_abort;
 187       testl(abort_status, (1<<i));
 188       jccb(Assembler::equal, check_abort);
 189       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 190       bind(check_abort);
 191     }
 192   }
 193 }
 194 
// Branch if ((random & (count-1)) != 0), count is 2^n
 196 // tmp, scr and flags are killed
 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 198   assert(tmp == rax, "");
 199   assert(scr == rdx, "");
 200   rdtsc(); // modifies EDX:EAX
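  // The low-order TSC bits left in EAX serve as a cheap pseudo-random value, so the
  // fall-through (no branch) path is taken roughly once per 'count' calls.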
 201   andptr(tmp, count-1);
 202   jccb(Assembler::notZero, brLabel);
 203 }
 204 
 205 // Perform abort ratio calculation, set no_rtm bit if high ratio
 206 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 207 // tmpReg, rtm_counters_Reg and flags are killed
 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 209                                                     Register rtm_counters_Reg,
 210                                                     RTMLockingCounters* rtm_counters,
 211                                                     Metadata* method_data) {
 212   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 213 
 214   if (RTMLockingCalculationDelay > 0) {
 215     // Delay calculation
 216     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 217     testptr(tmpReg, tmpReg);
 218     jccb(Assembler::equal, L_done);
 219   }
 220   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 221   //   Aborted transactions = abort_count * 100
 222   //   All transactions = total_count *  RTMTotalCountIncrRate
 223   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 224 
 225   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 226   cmpptr(tmpReg, RTMAbortThreshold);
 227   jccb(Assembler::below, L_check_always_rtm2);
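  // RTMAbortRatio is expressed as a percentage, hence the scaling of the abort count
  // by 100 below.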
 228   imulptr(tmpReg, tmpReg, 100);
 229 
 230   Register scrReg = rtm_counters_Reg;
 231   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 232   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 233   imulptr(scrReg, scrReg, RTMAbortRatio);
 234   cmpptr(tmpReg, scrReg);
 235   jccb(Assembler::below, L_check_always_rtm1);
 236   if (method_data != nullptr) {
 237     // set rtm_state to "no rtm" in MDO
 238     mov_metadata(tmpReg, method_data);
 239     lock();
 240     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 241   }
 242   jmpb(L_done);
 243   bind(L_check_always_rtm1);
 244   // Reload RTMLockingCounters* address
 245   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 246   bind(L_check_always_rtm2);
 247   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 248   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 249   jccb(Assembler::below, L_done);
 250   if (method_data != nullptr) {
 251     // set rtm_state to "always rtm" in MDO
 252     mov_metadata(tmpReg, method_data);
 253     lock();
 254     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 255   }
 256   bind(L_done);
 257 }
 258 
 259 // Update counters and perform abort ratio calculation
 260 // input:  abort_status_Reg
 261 // rtm_counters_Reg, flags are killed
 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 263                                       Register rtm_counters_Reg,
 264                                       RTMLockingCounters* rtm_counters,
 265                                       Metadata* method_data,
 266                                       bool profile_rtm) {
 267 
 268   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 269   // update rtm counters based on rax value at abort
 270   // reads abort_status_Reg, updates flags
 271   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 272   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 273   if (profile_rtm) {
 274     // Save abort status because abort_status_Reg is used by following code.
 275     if (RTMRetryCount > 0) {
 276       push(abort_status_Reg);
 277     }
 278     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 279     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 280     // restore abort status
 281     if (RTMRetryCount > 0) {
 282       pop(abort_status_Reg);
 283     }
 284   }
 285 }
 286 
 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 288 // inputs: retry_count_Reg
 289 //       : abort_status_Reg
 290 // output: retry_count_Reg decremented by 1
 291 // flags are killed
 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 293   Label doneRetry;
 294   assert(abort_status_Reg == rax, "");
 295   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 296   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 297   // if reason is in 0x6 and retry count != 0 then retry
 298   andptr(abort_status_Reg, 0x6);
 299   jccb(Assembler::zero, doneRetry);
 300   testl(retry_count_Reg, retry_count_Reg);
 301   jccb(Assembler::zero, doneRetry);
 302   pause();
 303   decrementl(retry_count_Reg);
 304   jmp(retryLabel);
 305   bind(doneRetry);
 306 }
 307 
 308 // Spin and retry if lock is busy,
 309 // inputs: box_Reg (monitor address)
 310 //       : retry_count_Reg
 311 // output: retry_count_Reg decremented by 1
 312 //       : clear z flag if retry count exceeded
 313 // tmp_Reg, scr_Reg, flags are killed
 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 315                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 316   Label SpinLoop, SpinExit, doneRetry;
 317   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 318 
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   decrementl(retry_count_Reg);
 322   movptr(scr_Reg, RTMSpinLoopCount);
 323 
 324   bind(SpinLoop);
 325   pause();
 326   decrementl(scr_Reg);
 327   jccb(Assembler::lessEqual, SpinExit);
 328   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 329   testptr(tmp_Reg, tmp_Reg);
 330   jccb(Assembler::notZero, SpinLoop);
 331 
 332   bind(SpinExit);
 333   jmp(retryLabel);
 334   bind(doneRetry);
 335   incrementl(retry_count_Reg); // clear z flag
 336 }
 337 
 338 // Use RTM for normal stack locks
 339 // Input: objReg (object to lock)
 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 341                                          Register retry_on_abort_count_Reg,
 342                                          RTMLockingCounters* stack_rtm_counters,
 343                                          Metadata* method_data, bool profile_rtm,
 344                                          Label& DONE_LABEL, Label& IsInflated) {
 345   assert(UseRTMForStackLocks, "why call this otherwise?");
 346   assert(tmpReg == rax, "");
 347   assert(scrReg == rdx, "");
 348   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 349 
 350   if (RTMRetryCount > 0) {
 351     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 352     bind(L_rtm_retry);
 353   }
 354   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 355   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 356   jcc(Assembler::notZero, IsInflated);
 357 
 358   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 359     Label L_noincrement;
 360     if (RTMTotalCountIncrRate > 1) {
 361       // tmpReg, scrReg and flags are killed
 362       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 363     }
 364     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 365     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 366     bind(L_noincrement);
 367   }
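  // xbegin starts the hardware transaction; on abort, execution resumes at L_on_abort
  // with the abort reason in EAX and all transactional writes discarded.  Inside the
  // transaction the lock word is only read (lock elision) -- nothing is stored.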
 368   xbegin(L_on_abort);
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 370   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 371   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 372   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 373 
 374   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 375   if (UseRTMXendForLockBusy) {
 376     xend();
 377     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 378     jmp(L_decrement_retry);
 379   }
 380   else {
 381     xabort(0);
 382   }
 383   bind(L_on_abort);
 384   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 385     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 386   }
 387   bind(L_decrement_retry);
 388   if (RTMRetryCount > 0) {
 389     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 390     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 391   }
 392 }
 393 
 394 // Use RTM for inflating locks
 395 // inputs: objReg (object to lock)
 396 //         boxReg (on-stack box address (displaced header location) - KILLED)
 397 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 399                                             Register scrReg, Register retry_on_busy_count_Reg,
 400                                             Register retry_on_abort_count_Reg,
 401                                             RTMLockingCounters* rtm_counters,
 402                                             Metadata* method_data, bool profile_rtm,
 403                                             Label& DONE_LABEL) {
 404   assert(UseRTMLocking, "why call this otherwise?");
 405   assert(tmpReg == rax, "");
 406   assert(scrReg == rdx, "");
 407   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 408   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 409 
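  // Store a non-zero 'unused' displaced header into the box: fast_unlock treats a null
  // displaced header as a recursive stack lock, and this inflated/RTM case must not be
  // mistaken for that.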
 410   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 411   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 412 
 413   if (RTMRetryCount > 0) {
 414     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 415     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 416     bind(L_rtm_retry);
 417   }
 418   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 419     Label L_noincrement;
 420     if (RTMTotalCountIncrRate > 1) {
 421       // tmpReg, scrReg and flags are killed
 422       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 423     }
 424     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 425     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 426     bind(L_noincrement);
 427   }
 428   xbegin(L_on_abort);
 429   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 430   movptr(tmpReg, Address(tmpReg, owner_offset));
 431   testptr(tmpReg, tmpReg);
 432   jcc(Assembler::zero, DONE_LABEL);
 433   if (UseRTMXendForLockBusy) {
 434     xend();
 435     jmp(L_decrement_retry);
 436   }
 437   else {
 438     xabort(0);
 439   }
 440   bind(L_on_abort);
 441   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 442   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 443     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 444   }
 445   if (RTMRetryCount > 0) {
 446     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 447     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 448   }
 449 
 450   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 451   testptr(tmpReg, tmpReg) ;
 452   jccb(Assembler::notZero, L_decrement_retry) ;
 453 
 454   // Appears unlocked - try to swing _owner from null to non-null.
 455   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 456 #ifdef _LP64
 457   Register threadReg = r15_thread;
 458 #else
 459   get_thread(scrReg);
 460   Register threadReg = scrReg;
 461 #endif
 462   lock();
 463   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 464 
 465   if (RTMRetryCount > 0) {
 466     // success done else retry
 467     jccb(Assembler::equal, DONE_LABEL) ;
 468     bind(L_decrement_retry);
 469     // Spin and retry if lock is busy.
 470     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 471   }
 472   else {
 473     bind(L_decrement_retry);
 474   }
 475 }
 476 
 477 #endif //  INCLUDE_RTM_OPT
 478 
 479 // fast_lock and fast_unlock used by C2
 480 
 481 // Because the transitions from emitted code to the runtime
 482 // monitorenter/exit helper stubs are so slow it's critical that
 483 // we inline both the stack-locking fast path and the inflated fast path.
 484 //
 485 // See also: cmpFastLock and cmpFastUnlock.
 486 //
 487 // What follows is a specialized inline transliteration of the code
 488 // in enter() and exit(). If we're concerned about I$ bloat another
 489 // option would be to emit TrySlowEnter and TrySlowExit methods
 490 // at startup-time.  These methods would accept arguments as
 491 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 492 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 493 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 494 // In practice, however, the # of lock sites is bounded and is usually small.
 495 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 499 //
 500 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 501 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 502 // to those specialized methods.  That'd give us a mostly platform-independent
 503 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross over to native code would be
 505 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 506 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 507 // (b) explicit barriers or fence operations.
 508 //
 509 // TODO:
 510 //
 511 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 512 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 513 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 514 //    the lock operators would typically be faster than reifying Self.
 515 //
 516 // *  Ideally I'd define the primitives as:
 517 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 518 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 519 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 521 //    Furthermore the register assignments are overconstrained, possibly resulting in
 522 //    sub-optimal code near the synchronization site.
 523 //
 524 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 525 //    Alternately, use a better sp-proximity test.
 526 //
 527 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 528 //    Either one is sufficient to uniquely identify a thread.
 529 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 530 //
 531 // *  Intrinsify notify() and notifyAll() for the common cases where the
 532 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 534 //
 535 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 536 //    But beware of excessive branch density on AMD Opterons.
 537 //
 538 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 539 //    or failure of the fast path.  If the fast path fails then we pass
 540 //    control to the slow path, typically in C.  In fast_lock and
 541 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 542 //    will emit a conditional branch immediately after the node.
 543 //    So we have branches to branches and lots of ICC.ZF games.
 544 //    Instead, it might be better to have C2 pass a "FailureLabel"
 545 //    into fast_lock and fast_unlock.  In the case of success, control
 546 //    will drop through the node.  ICC.ZF is undefined at exit.
 547 //    In the case of failure, the node will branch directly to the
 548 //    FailureLabel
 549 
 550 
 551 // obj: object to lock
 552 // box: on-stack box address (displaced header location) - KILLED
 553 // rax,: tmp -- KILLED
 554 // scr: tmp -- KILLED
 555 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 556                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 557                                  RTMLockingCounters* rtm_counters,
 558                                  RTMLockingCounters* stack_rtm_counters,
 559                                  Metadata* method_data,
 560                                  bool use_rtm, bool profile_rtm) {
 561   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 562   // Ensure the register assignments are disjoint
 563   assert(tmpReg == rax, "");
 564 
 565   if (use_rtm) {
 566     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 567   } else {
 568     assert(cx1Reg == noreg, "");
 569     assert(cx2Reg == noreg, "");
 570     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 571   }
 572 
 573   // Possible cases that we'll encounter in fast_lock
 574   // ------------------------------------------------
 575   // * Inflated
 576   //    -- unlocked
 577   //    -- Locked
 578   //       = by self
 579   //       = by other
 580   // * neutral
 581   // * stack-locked
 582   //    -- by self
 583   //       = sp-proximity test hits
 584   //       = sp-proximity test generates false-negative
 585   //    -- by other
 586   //
 587 
 588   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 589 
 590   if (DiagnoseSyncOnValueBasedClasses != 0) {
 591     load_klass(tmpReg, objReg, scrReg);
 592     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 593     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 594     jcc(Assembler::notZero, DONE_LABEL);
 595   }
 596 
 597 #if INCLUDE_RTM_OPT
 598   if (UseRTMForStackLocks && use_rtm) {
 599     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 600     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 601                       stack_rtm_counters, method_data, profile_rtm,
 602                       DONE_LABEL, IsInflated);
 603   }
 604 #endif // INCLUDE_RTM_OPT
 605 
 606   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 607   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 608   jcc(Assembler::notZero, IsInflated);
 609 
 610   if (LockingMode == LM_MONITOR) {
 611     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 612     testptr(objReg, objReg);
 613   } else {
 614     assert(LockingMode == LM_LEGACY, "must be");
 615     // Attempt stack-locking ...
 616     orptr (tmpReg, markWord::unlocked_value);
 617     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 618     lock();
 619     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 620     jcc(Assembler::equal, COUNT);           // Success
 621 
 622     // Recursive locking.
 623     // The object is stack-locked: markword contains stack pointer to BasicLock.
 624     // Locked by current thread if difference with current SP is less than one page.
 625     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
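    // With tmpReg holding (mark - rsp), the mask keeps the low lock/alignment bits plus
    // every bit at or above the page size (e.g. 7 - 4096 == 0xfffff007 for 4K pages), so
    // ZF is set only for a properly aligned stack address less than one page above rsp.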
 627     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 628     movptr(Address(boxReg, 0), tmpReg);
 629   }
 630   jmp(DONE_LABEL);
 631 
 632   bind(IsInflated);
 633   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 634 
 635 #if INCLUDE_RTM_OPT
 636   // Use the same RTM locking code in 32- and 64-bit VM.
 637   if (use_rtm) {
 638     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 639                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 640   } else {
 641 #endif // INCLUDE_RTM_OPT
 642 
 643 #ifndef _LP64
 644   // The object is inflated.
 645 
 646   // boxReg refers to the on-stack BasicLock in the current frame.
 647   // We'd like to write:
 648   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 650   // additional latency as we have another ST in the store buffer that must drain.
 651 
 652   // avoid ST-before-CAS
 653   // register juggle because we need tmpReg for cmpxchgptr below
 654   movptr(scrReg, boxReg);
 655   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 656 
 657   // Optimistic form: consider XORL tmpReg,tmpReg
 658   movptr(tmpReg, NULL_WORD);
 659 
 660   // Appears unlocked - try to swing _owner from null to non-null.
 661   // Ideally, I'd manifest "Self" with get_thread and then attempt
 662   // to CAS the register containing Self into m->Owner.
 663   // But we don't have enough registers, so instead we can either try to CAS
 664   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 665   // we later store "Self" into m->Owner.  Transiently storing a stack address
 666   // (rsp or the address of the box) into  m->owner is harmless.
 667   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 668   lock();
 669   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 670   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 671   // If we weren't able to swing _owner from null to the BasicLock
 672   // then take the slow path.
 673   jccb  (Assembler::notZero, NO_COUNT);
 674   // update _owner from BasicLock to thread
 675   get_thread (scrReg);                    // beware: clobbers ICCs
 676   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 677   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 678 
 679   // If the CAS fails we can either retry or pass control to the slow path.
 680   // We use the latter tactic.
 681   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 682   // If the CAS was successful ...
 683   //   Self has acquired the lock
 684   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 685   // Intentional fall-through into DONE_LABEL ...
 686 #else // _LP64
 687   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 688   movq(scrReg, tmpReg);
 689   xorq(tmpReg, tmpReg);
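  // rax (tmpReg) is zero, so the CAS below expects a null owner.  On success ZF is set and
  // this thread now owns the monitor; on failure rax receives the current owner, which the
  // recursion check further down compares against.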
 690   lock();
 691   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 692   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 693   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 694   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 695   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 696   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 697 
 698   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 699   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 700   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 701   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 702 #endif // _LP64
 703 #if INCLUDE_RTM_OPT
 704   } // use_rtm()
 705 #endif
 706   bind(DONE_LABEL);
 707 
 708   // ZFlag == 1 count in fast path
 709   // ZFlag == 0 count in slow path
 710   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 711 
 712   bind(COUNT);
 713   // Count monitors in fast path
 714   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 715 
 716   xorl(tmpReg, tmpReg); // Set ZF == 1
 717 
 718   bind(NO_COUNT);
 719 
 720   // At NO_COUNT the icc ZFlag is set as follows ...
 721   // fast_unlock uses the same protocol.
 722   // ZFlag == 1 -> Success
 723   // ZFlag == 0 -> Failure - force control through the slow path
 724 }
 725 
 726 // obj: object to unlock
 727 // box: box address (displaced header location), killed.  Must be EAX.
 728 // tmp: killed, cannot be obj nor box.
 729 //
 730 // Some commentary on balanced locking:
 731 //
 732 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 733 // Methods that don't have provably balanced locking are forced to run in the
 734 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 735 // The interpreter provides two properties:
 736 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 742 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 744 // B() doesn't have provably balanced locking so it runs in the interpreter.
 745 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 746 // is still locked by A().
 747 //
 748 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 749 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 750 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 751 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 752 // Arguably given that the spec legislates the JNI case as undefined our implementation
 753 // could reasonably *avoid* checking owner in fast_unlock().
 754 // In the interest of performance we elide m->Owner==Self check in unlock.
 755 // A perfectly viable alternative is to elide the owner check except when
 756 // Xcheck:jni is enabled.
 757 
 758 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 759   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 760   assert(boxReg == rax, "");
 761   assert_different_registers(objReg, boxReg, tmpReg);
 762 
 763   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 764 
 765 #if INCLUDE_RTM_OPT
 766   if (UseRTMForStackLocks && use_rtm) {
 767     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 768     Label L_regular_unlock;
 769     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 770     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 771     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 772     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 773     xend();                                                           // otherwise end...
 774     jmp(DONE_LABEL);                                                  // ... and we're done
 775     bind(L_regular_unlock);
 776   }
 777 #endif
 778 
 779   if (LockingMode == LM_LEGACY) {
 780     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 781     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 782   }
 783   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 784   if (LockingMode != LM_MONITOR) {
 785     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 786     jcc(Assembler::zero, Stacked);
 787   }
 788 
 789   // It's inflated.
 790 
 791 #if INCLUDE_RTM_OPT
 792   if (use_rtm) {
 793     Label L_regular_inflated_unlock;
 794     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 795     movptr(boxReg, Address(tmpReg, owner_offset));
 796     testptr(boxReg, boxReg);
 797     jccb(Assembler::notZero, L_regular_inflated_unlock);
 798     xend();
 799     jmp(DONE_LABEL);
 800     bind(L_regular_inflated_unlock);
 801   }
 802 #endif
 803 
 804   // Despite our balanced locking property we still check that m->_owner == Self
 805   // as java routines or native JNI code called by this thread might
 806   // have released the lock.
 807   // Refer to the comments in synchronizer.cpp for how we might encode extra
 808   // state in _succ so we can avoid fetching EntryList|cxq.
 809   //
 810   // If there's no contention try a 1-0 exit.  That is, exit without
 811   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 812   // we detect and recover from the race that the 1-0 exit admits.
 813   //
 814   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 815   // before it STs null into _owner, releasing the lock.  Updates
 816   // to data protected by the critical section must be visible before
 817   // we drop the lock (and thus before any other thread could acquire
 818   // the lock and observe the fields protected by the lock).
 819   // IA32's memory-model is SPO, so STs are ordered with respect to
 820   // each other and there's no need for an explicit barrier (fence).
 821   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 822 #ifndef _LP64
 823   // Note that we could employ various encoding schemes to reduce
 824   // the number of loads below (currently 4) to just 2 or 3.
 825   // Refer to the comments in synchronizer.cpp.
 826   // In practice the chain of fetches doesn't seem to impact performance, however.
 827   xorptr(boxReg, boxReg);
 828   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 829   jccb  (Assembler::notZero, DONE_LABEL);
 830   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 831   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 832   jccb  (Assembler::notZero, DONE_LABEL);
 833   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 834   jmpb  (DONE_LABEL);
 835 #else // _LP64
 836   // It's inflated
 837   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 838 
 839   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 840   jccb(Assembler::equal, LNotRecursive);
 841 
 842   // Recursive inflated unlock
 843   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 844   jmpb(LSuccess);
 845 
 846   bind(LNotRecursive);
 847   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 848   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 849   jccb  (Assembler::notZero, CheckSucc);
 850   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 851   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 852   jmpb  (DONE_LABEL);
 853 
 854   // Try to avoid passing control into the slow_path ...
 855   bind  (CheckSucc);
 856 
 857   // The following optional optimization can be elided if necessary
 858   // Effectively: if (succ == null) goto slow path
 859   // The code reduces the window for a race, however,
 860   // and thus benefits performance.
 861   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 862   jccb  (Assembler::zero, LGoSlowPath);
 863 
 864   xorptr(boxReg, boxReg);
 865   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 866   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 867 
 868   // Memory barrier/fence
 869   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 870   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 871   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 872   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 873   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 874   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 875   lock(); addl(Address(rsp, 0), 0);
 876 
 877   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 878   jccb  (Assembler::notZero, LSuccess);
 879 
 880   // Rare inopportune interleaving - race.
 881   // The successor vanished in the small window above.
 882   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 883   // We need to ensure progress and succession.
 884   // Try to reacquire the lock.
 885   // If that fails then the new owner is responsible for succession and this
 886   // thread needs to take no further action and can exit via the fast path (success).
 887   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 891 
 892   // box is really RAX -- the following CMPXCHG depends on that binding
 893   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 894   lock();
 895   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 896   // There's no successor so we tried to regrab the lock.
 897   // If that didn't work, then another thread grabbed the
 898   // lock so we're done (and exit was a success).
 899   jccb  (Assembler::notEqual, LSuccess);
 900   // Intentional fall-through into slow path
 901 
 902   bind  (LGoSlowPath);
 903   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 904   jmpb  (DONE_LABEL);
 905 
 906   bind  (LSuccess);
 907   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 908   jmpb  (DONE_LABEL);
 909 
 910 #endif
 911   if (LockingMode == LM_LEGACY) {
 912     bind  (Stacked);
 913     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 914     lock();
 915     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 916     // Intentional fall-thru into DONE_LABEL
 917   }
 918 
 919   bind(DONE_LABEL);
 920 
 921   // ZFlag == 1 count in fast path
 922   // ZFlag == 0 count in slow path
 923   jccb(Assembler::notZero, NO_COUNT);
 924 
 925   bind(COUNT);
 926   // Count monitors in fast path
 927 #ifndef _LP64
 928   get_thread(tmpReg);
 929   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 930 #else // _LP64
 931   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 932 #endif
 933 
 934   xorl(tmpReg, tmpReg); // Set ZF == 1
 935 
 936   bind(NO_COUNT);
 937 }
 938 
 939 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 940                                               Register t, Register thread) {
 941   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 942   assert(rax_reg == rax, "Used for CAS");
 943   assert_different_registers(obj, box, rax_reg, t, thread);
 944 
 945   // Handle inflated monitor.
 946   Label inflated;
 947   // Finish fast lock successfully. ZF value is irrelevant.
 948   Label locked;
 949   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 950   Label slow_path;
 951 
 952   if (UseObjectMonitorTable) {
 953     // Clear cache in case fast locking succeeds.
 954     movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
 955   }
 956 
 957   if (DiagnoseSyncOnValueBasedClasses != 0) {
 958     load_klass(rax_reg, obj, t);
 959     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 960     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 961     jcc(Assembler::notZero, slow_path);
 962   }
 963 
 964   const Register mark = t;
 965 
 966   { // Lightweight Lock
 967 
 968     Label push;
 969 
 970     const Register top = UseObjectMonitorTable ? rax_reg : box;
 971 
 972     // Load the mark.
 973     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 974 
 975     // Prefetch top.
 976     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 977 
 978     // Check for monitor (0b10).
 979     testptr(mark, markWord::monitor_value);
 980     jcc(Assembler::notZero, inflated);
 981 
 982     // Check if lock-stack is full.
 983     cmpl(top, LockStack::end_offset() - 1);
 984     jcc(Assembler::greater, slow_path);
 985 
 986     // Check if recursive.
 987     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 988     jccb(Assembler::equal, push);
 989 
 990     // Try to lock. Transition lock bits 0b01 => 0b00
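    // Below, rax becomes the expected (unlocked) mark and 'mark' the desired (locked)
    // value, so the CAS succeeds only if the object is still unlocked and unchanged.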
 991     movptr(rax_reg, mark);
 992     orptr(rax_reg, markWord::unlocked_value);
 993     andptr(mark, ~(int32_t)markWord::unlocked_value);
 994     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 995     jcc(Assembler::notEqual, slow_path);
 996 
 997     if (UseObjectMonitorTable) {
 998       // Need to reload top, clobbered by CAS.
 999       movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1000     }
1001     bind(push);
1002     // After successful lock, push object on lock-stack.
1003     movptr(Address(thread, top), obj);
1004     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1005     jmpb(locked);
1006   }
1007 
1008   { // Handle inflated monitor.
1009     bind(inflated);
1010 
1011     const Register monitor = t;
1012 
1013     if (!UseObjectMonitorTable) {
1014       assert(mark == monitor, "should be the same here");
1015     } else {
1016       // Uses ObjectMonitorTable.  Look for the monitor in the om_cache.
1017       if (OMCacheHitRate) increment(Address(thread, JavaThread::lock_lookup_offset()));
1018 
1019       // Fetch ObjectMonitor* from the cache or take the slow-path.
1020       Label monitor_found;
1021 
1022       // Load cache address
1023       lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
1024 
1025       const int num_unrolled = MIN2(OMC2UnrollCacheEntries, OMCacheSize);
1026       for (int i = 0; i < num_unrolled; i++) {
1027         cmpptr(obj, Address(t));
1028         jccb(Assembler::equal, monitor_found);
1029         if (i + 1 != num_unrolled) {
1030           increment(t, in_bytes(OMCache::oop_to_oop_difference()));
1031         }
1032       }
1033 
1034       if (num_unrolled == 0 || (OMC2UnrollCacheLookupLoopTail && num_unrolled != OMCacheSize)) {
1035         if (num_unrolled != 0) {
1036           // Loop after unrolling, advance iterator.
1037           increment(t, in_bytes(OMCache::oop_to_oop_difference()));
1038         }
1039 
1040         Label loop;
1041 
1042         // Search for obj in cache.
1043         bind(loop);
1044 
1045         // Check for match.
1046         cmpptr(obj, Address(t));
1047         jccb(Assembler::equal, monitor_found);
1048 
1049         // Search until null encountered, guaranteed _null_sentinel at end.
1050         cmpptr(Address(t), 1);
1051         jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
1052         increment(t, in_bytes(OMCache::oop_to_oop_difference()));
1053         jmpb(loop);
1054       } else {
1055         jmp(slow_path);
1056       }
1057 
1058       // Cache hit.
1059       bind(monitor_found);
1060       movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
1061       if (OMCacheHitRate) increment(Address(thread, JavaThread::lock_hit_offset()));
1062     }
1063     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
1064     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
1065     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
1066 
1067     Label monitor_locked;
1068     // Lock the monitor.
1069 
1070     // CAS owner (null => current thread).
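    // rax is zeroed as the expected null owner; on failure it receives the current owner,
    // which the recursion check below compares against the current thread.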
1071     xorptr(rax_reg, rax_reg);
1072     lock(); cmpxchgptr(thread, owner_address);
1073     jccb(Assembler::equal, monitor_locked);
1074 
1075     // Check if recursive.
1076     cmpptr(thread, rax_reg);
1077     jccb(Assembler::notEqual, slow_path);
1078 
1079     // Recursive.
1080     increment(recursions_address);
1081 
1082     bind(monitor_locked);
1083     if (UseObjectMonitorTable) {
1084       // Cache the monitor for unlock
1085       movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
1086     }
1087   }
1088 
1089   bind(locked);
1090   increment(Address(thread, JavaThread::held_monitor_count_offset()));
1091   // Set ZF = 1
1092   xorl(rax_reg, rax_reg);
1093 
1094 #ifdef ASSERT
1095   // Check that locked label is reached with ZF set.
1096   Label zf_correct;
1097   Label zf_bad_zero;
1098   jcc(Assembler::zero, zf_correct);
1099   jmp(zf_bad_zero);
1100 #endif
1101 
1102   bind(slow_path);
1103 #ifdef ASSERT
1104   // Check that slow_path label is reached with ZF not set.
1105   jcc(Assembler::notZero, zf_correct);
1106   stop("Fast Lock ZF != 0");
1107   bind(zf_bad_zero);
1108   stop("Fast Lock ZF != 1");
1109   bind(zf_correct);
1110 #endif
1111   // C2 uses the value of ZF to determine the continuation.
1112 }
1113 
1114 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
1115   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1116   assert(reg_rax == rax, "Used for CAS");
1117   assert_different_registers(obj, reg_rax, t);
1118 
1119   // Handle inflated monitor.
1120   Label inflated, inflated_check_lock_stack;
1121   // Finish fast unlock successfully.  MUST jump with ZF == 1
1122   Label unlocked;
1123 
1124   // Assume success.
1125   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
1126 
1127   const Register mark = t;
1128   const Register monitor = t;
1129   const Register top = UseObjectMonitorTable ? t : reg_rax;
1130   const Register box = reg_rax;
1131 
1132   Label dummy;
1133   C2FastUnlockLightweightStub* stub = nullptr;
1134 
1135   if (!Compile::current()->output()->in_scratch_emit_size()) {
1136     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
1137     Compile::current()->output()->add_stub(stub);
1138   }
1139 
1140   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1141   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1142   Label& slow_path = stub == nullptr ? dummy : stub->slow_path();
1143 
1144   { // Lightweight Unlock
1145 
1146     // Load top.
1147     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1148 
1149     if (!UseObjectMonitorTable) {
1150       // Prefetch mark.
1151       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1152     }
1153 
1154     // Check if obj is top of lock-stack.
1155     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1156     // Top of lock stack was not obj. Must be monitor.
1157     jcc(Assembler::notEqual, inflated_check_lock_stack);
1158 
1159     // Pop lock-stack.
1160     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1161     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1162 
1163     // Check if recursive.
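    // If the next entry down the lock-stack is also obj this was a recursive lightweight
    // lock; the cmpptr leaves ZF == 1, which is exactly the success protocol at 'unlocked'.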
1164     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1165     jcc(Assembler::equal, unlocked);
1166 
1167     // We elide the monitor check, let the CAS fail instead.
1168 
1169     if (UseObjectMonitorTable) {
1170       // Load mark.
1171       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1172     }
1173 
1174     // Try to unlock. Transition lock bits 0b00 => 0b01
1175     movptr(reg_rax, mark);
1176     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1177     orptr(mark, markWord::unlocked_value);
1178     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1179     jcc(Assembler::notEqual, push_and_slow_path);
1180     jmp(unlocked);
1181   }
1182 
1183 
1184   { // Handle inflated monitor.
1185     bind(inflated_check_lock_stack);
1186 #ifdef ASSERT
1187     Label check_done;
1188     subl(top, oopSize);
1189     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1190     jcc(Assembler::below, check_done);
1191     cmpptr(obj, Address(thread, top));
1192     jccb(Assembler::notEqual, inflated_check_lock_stack);
1193     stop("Fast Unlock lock on stack");
1194     bind(check_done);
1195     if (UseObjectMonitorTable) {
1196       movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1197     }
1198     testptr(mark, markWord::monitor_value);
1199     jccb(Assembler::notZero, inflated);
1200     stop("Fast Unlock not monitor");
1201 #endif
1202 
1203     bind(inflated);
1204 
1205     if (!UseObjectMonitorTable) {
1206       assert(mark == monitor, "should be the same here");
1207     } else {
1208       // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
1209       if (OMCacheHitRate) increment(Address(thread, JavaThread::unlock_lookup_offset()));
1210       movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
1211       // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
1212       cmpptr(monitor, alignof(ObjectMonitor*));
1213       jcc(Assembler::below, slow_path);
1214 
1215       if (OMCacheHitRate) increment(Address(thread, JavaThread::unlock_hit_offset()));
1216     }
1217     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
1218     const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
1219     const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
1220     const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
1221     const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
1222 
1223     Label recursive;
1224 
1225     // Check if recursive.
1226     cmpptr(recursions_address, 0);
1227     jccb(Assembler::notEqual, recursive);
1228 
1229     // Check if the entry lists are empty.
1230     movptr(reg_rax, cxq_address);
1231     orptr(reg_rax, EntryList_address);
1232     jcc(Assembler::notZero, check_successor);
1233 
1234     // Release lock.
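    // As in fast_unlock above: x86 does not reorder stores with earlier stores, so a plain
    // store suffices as the releasing store of the owner field -- no fence is needed here.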
1235     movptr(owner_address, NULL_WORD);
1236     jmpb(unlocked);
1237 
1238     // Recursive unlock.
1239     bind(recursive);
1240     decrement(recursions_address);
1241     xorl(t, t);
1242   }
1243 
1244   bind(unlocked);
1245   if (stub != nullptr) {
1246     bind(stub->unlocked_continuation());
1247   }
1248 
1249 #ifdef ASSERT
1250   // Check that unlocked label is reached with ZF set.
1251   Label zf_correct;
1252   jcc(Assembler::zero, zf_correct);
1253   stop("Fast Unlock ZF != 1");
1254 #endif
1255 
1256   if (stub != nullptr) {
1257     bind(stub->slow_path_continuation());
1258   }
1259 #ifdef ASSERT
1260   // Check that stub->continuation() label is reached with ZF not set.
1261   jccb(Assembler::notZero, zf_correct);
1262   stop("Fast Unlock ZF != 0");
1263   bind(zf_correct);
1264 #endif
1265   // C2 uses the value of ZF to determine the continuation.
1266 }
1267 
1268 //-------------------------------------------------------------------------------------------
1269 // Generic instructions support for use in .ad files C2 code generation
1270 
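// For the abs/neg helpers below: the sign-mask constant has every bit set except the sign
// bit of each lane, so ANDing with it clears the sign (absolute value); the sign-flip
// constant has only the sign bit set, so XORing with it negates.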
1271 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1272   if (dst != src) {
1273     movdqu(dst, src);
1274   }
1275   if (opcode == Op_AbsVD) {
1276     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1277   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1279     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1280   }
1281 }
1282 
1283 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1284   if (opcode == Op_AbsVD) {
1285     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1286   } else {
1287     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
1288     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1289   }
1290 }
1291 
1292 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1293   if (dst != src) {
1294     movdqu(dst, src);
1295   }
1296   if (opcode == Op_AbsVF) {
1297     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1298   } else {
1299     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1300     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1301   }
1302 }
1303 
1304 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1305   if (opcode == Op_AbsVF) {
1306     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1307   } else {
1308     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1309     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1310   }
1311 }
1312 
1313 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1314   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1315   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1316 
1317   if (opcode == Op_MinV) {
1318     if (elem_bt == T_BYTE) {
1319       pminsb(dst, src);
1320     } else if (elem_bt == T_SHORT) {
1321       pminsw(dst, src);
1322     } else if (elem_bt == T_INT) {
1323       pminsd(dst, src);
1324     } else {
1325       assert(elem_bt == T_LONG, "required");
1326       assert(tmp == xmm0, "required");
1327       assert_different_registers(dst, src, tmp);
1328       movdqu(xmm0, dst);
1329       pcmpgtq(xmm0, src);
1330       blendvpd(dst, src);  // xmm0 as mask
1331     }
1332   } else { // opcode == Op_MaxV
1333     if (elem_bt == T_BYTE) {
1334       pmaxsb(dst, src);
1335     } else if (elem_bt == T_SHORT) {
1336       pmaxsw(dst, src);
1337     } else if (elem_bt == T_INT) {
1338       pmaxsd(dst, src);
1339     } else {
1340       assert(elem_bt == T_LONG, "required");
1341       assert(tmp == xmm0, "required");
1342       assert_different_registers(dst, src, tmp);
1343       movdqu(xmm0, src);
1344       pcmpgtq(xmm0, dst);
1345       blendvpd(dst, src);  // xmm0 as mask
1346     }
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1351                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1352                                  int vlen_enc) {
1353   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1354 
1355   if (opcode == Op_MinV) {
1356     if (elem_bt == T_BYTE) {
1357       vpminsb(dst, src1, src2, vlen_enc);
1358     } else if (elem_bt == T_SHORT) {
1359       vpminsw(dst, src1, src2, vlen_enc);
1360     } else if (elem_bt == T_INT) {
1361       vpminsd(dst, src1, src2, vlen_enc);
1362     } else {
1363       assert(elem_bt == T_LONG, "required");
1364       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1365         vpminsq(dst, src1, src2, vlen_enc);
1366       } else {
1367         assert_different_registers(dst, src1, src2);
1368         vpcmpgtq(dst, src1, src2, vlen_enc);
1369         vblendvpd(dst, src1, src2, dst, vlen_enc);
1370       }
1371     }
1372   } else { // opcode == Op_MaxV
1373     if (elem_bt == T_BYTE) {
1374       vpmaxsb(dst, src1, src2, vlen_enc);
1375     } else if (elem_bt == T_SHORT) {
1376       vpmaxsw(dst, src1, src2, vlen_enc);
1377     } else if (elem_bt == T_INT) {
1378       vpmaxsd(dst, src1, src2, vlen_enc);
1379     } else {
1380       assert(elem_bt == T_LONG, "required");
1381       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1382         vpmaxsq(dst, src1, src2, vlen_enc);
1383       } else {
1384         assert_different_registers(dst, src1, src2);
1385         vpcmpgtq(dst, src1, src2, vlen_enc);
1386         vblendvpd(dst, src2, src1, dst, vlen_enc);
1387       }
1388     }
1389   }
1390 }
1391 
1392 // Float/Double min max
1393 
1394 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1395                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1396                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1397                                    int vlen_enc) {
1398   assert(UseAVX > 0, "required");
1399   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1400          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1401   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1402   assert_different_registers(a, tmp, atmp, btmp);
1403   assert_different_registers(b, tmp, atmp, btmp);
1404 
1405   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1406   bool is_double_word = is_double_word_type(elem_bt);
1407 
1408   /* Note on 'non-obvious' assembly sequence:
1409    *
1410    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1411    * and Java on how they handle floats:
1412    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1413    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1414    *
1415    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1416    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1417    *                (only useful when signs differ, a no-op otherwise)
1418    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1419    *
1420    *  The following pseudocode describes the algorithm for max[FD] (the min algorithm is analogous):
1421    *   btmp = (b < +0.0) ? a : b
1422    *   atmp = (b < +0.0) ? b : a
1423    *   Tmp  = Max_Float(atmp, btmp)
1424    *   Res  = isNaN(atmp) ? atmp : Tmp
1425    */
1426 
1427   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1428   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1429   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1430   XMMRegister mask;
1431 
1432   if (!is_double_word && is_min) {
1433     mask = a;
1434     vblend = &MacroAssembler::vblendvps;
1435     vmaxmin = &MacroAssembler::vminps;
1436     vcmp = &MacroAssembler::vcmpps;
1437   } else if (!is_double_word && !is_min) {
1438     mask = b;
1439     vblend = &MacroAssembler::vblendvps;
1440     vmaxmin = &MacroAssembler::vmaxps;
1441     vcmp = &MacroAssembler::vcmpps;
1442   } else if (is_double_word && is_min) {
1443     mask = a;
1444     vblend = &MacroAssembler::vblendvpd;
1445     vmaxmin = &MacroAssembler::vminpd;
1446     vcmp = &MacroAssembler::vcmppd;
1447   } else {
1448     assert(is_double_word && !is_min, "sanity");
1449     mask = b;
1450     vblend = &MacroAssembler::vblendvpd;
1451     vmaxmin = &MacroAssembler::vmaxpd;
1452     vcmp = &MacroAssembler::vcmppd;
1453   }
1454 
1455   // Pick the min/max result and scratch registers so register overlaps (dst aliasing btmp) do not force the EnableX86ECoreOpts sequence to be abandoned
1456   XMMRegister maxmin, scratch;
1457   if (dst == btmp) {
1458     maxmin = btmp;
1459     scratch = tmp;
1460   } else {
1461     maxmin = tmp;
1462     scratch = btmp;
1463   }
1464 
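       // On the EnableX86ECoreOpts path, materialize the blend mask up front: an arithmetic right
       // shift by 32 broadcasts each float lane's sign bit, and vpcmpgtq against zero does the same
       // for double lanes, so the blends below are told (via !precompute_mask) that the mask is
       // already fully expanded.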
1465   bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1466   if (precompute_mask && !is_double_word) {
1467     vpsrad(tmp, mask, 32, vlen_enc);
1468     mask = tmp;
1469   } else if (precompute_mask && is_double_word) {
1470     vpxor(tmp, tmp, tmp, vlen_enc);
1471     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1472     mask = tmp;
1473   }
1474 
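       // Mapping to the pseudocode above:
       //   atmp   = (mask < 0) ? b : a
       //   btmp   = (mask < 0) ? a : b
       //   maxmin = Min/Max_Float(atmp, btmp)
       //   scratch flags the NaN lanes of atmp (unordered self-compare)
       //   dst    = isNaN(atmp) ? atmp : maxmin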
1475   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1476   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1477   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1478   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1479   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1480 }
1481 
1482 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1483                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1484                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1485                                     int vlen_enc) {
1486   assert(UseAVX > 2, "required");
1487   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1488          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1489   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1490   assert_different_registers(dst, a, atmp, btmp);
1491   assert_different_registers(dst, b, atmp, btmp);
1492 
1493   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1494   bool is_double_word = is_double_word_type(elem_bt);
1495   bool merge = true;
1496 
1497   if (!is_double_word && is_min) {
1498     evpmovd2m(ktmp, a, vlen_enc);
1499     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1500     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1501     vminps(dst, atmp, btmp, vlen_enc);
1502     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1503     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1504   } else if (!is_double_word && !is_min) {
1505     evpmovd2m(ktmp, b, vlen_enc);
1506     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1507     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1508     vmaxps(dst, atmp, btmp, vlen_enc);
1509     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1510     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1511   } else if (is_double_word && is_min) {
1512     evpmovq2m(ktmp, a, vlen_enc);
1513     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1514     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1515     vminpd(dst, atmp, btmp, vlen_enc);
1516     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1517     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1518   } else {
1519     assert(is_double_word && !is_min, "sanity");
1520     evpmovq2m(ktmp, b, vlen_enc);
1521     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1522     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1523     vmaxpd(dst, atmp, btmp, vlen_enc);
1524     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1525     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1526   }
1527 }
1528 
1529 // Float/Double signum
1530 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1531   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1532 
1533   Label DONE_LABEL;
1534 
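       // Compare the argument against 0.0 once and reuse the flags: equal/parity means +/-0.0 or NaN,
       // so the argument is returned unchanged; above means positive, so 1.0 is returned; otherwise
       // the sign of 1.0 is flipped to produce -1.0. movflt/movdbl do not modify EFLAGS, so the
       // jcc(above) after the move still tests the original comparison.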
1535   if (opcode == Op_SignumF) {
1536     assert(UseSSE > 0, "required");
1537     ucomiss(dst, zero);
1538     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1539     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1540     movflt(dst, one);
1541     jcc(Assembler::above, DONE_LABEL);
1542     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1543   } else if (opcode == Op_SignumD) {
1544     assert(UseSSE > 1, "required");
1545     ucomisd(dst, zero);
1546     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1547     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1548     movdbl(dst, one);
1549     jcc(Assembler::above, DONE_LABEL);
1550     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1551   }
1552 
1553   bind(DONE_LABEL);
1554 }
1555 
1556 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1557   if (sign) {
1558     pmovsxbw(dst, src);
1559   } else {
1560     pmovzxbw(dst, src);
1561   }
1562 }
1563 
1564 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1565   if (sign) {
1566     vpmovsxbw(dst, src, vector_len);
1567   } else {
1568     vpmovzxbw(dst, src, vector_len);
1569   }
1570 }
1571 
1572 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1573   if (sign) {
1574     vpmovsxbd(dst, src, vector_len);
1575   } else {
1576     vpmovzxbd(dst, src, vector_len);
1577   }
1578 }
1579 
1580 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1581   if (sign) {
1582     vpmovsxwd(dst, src, vector_len);
1583   } else {
1584     vpmovzxwd(dst, src, vector_len);
1585   }
1586 }
1587 
1588 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1589                                      int shift, int vector_len) {
1590   if (opcode == Op_RotateLeftV) {
1591     if (etype == T_INT) {
1592       evprold(dst, src, shift, vector_len);
1593     } else {
1594       assert(etype == T_LONG, "expected type T_LONG");
1595       evprolq(dst, src, shift, vector_len);
1596     }
1597   } else {
1598     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1599     if (etype == T_INT) {
1600       evprord(dst, src, shift, vector_len);
1601     } else {
1602       assert(etype == T_LONG, "expected type T_LONG");
1603       evprorq(dst, src, shift, vector_len);
1604     }
1605   }
1606 }
1607 
1608 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1609                                      XMMRegister shift, int vector_len) {
1610   if (opcode == Op_RotateLeftV) {
1611     if (etype == T_INT) {
1612       evprolvd(dst, src, shift, vector_len);
1613     } else {
1614       assert(etype == T_LONG, "expected type T_LONG");
1615       evprolvq(dst, src, shift, vector_len);
1616     }
1617   } else {
1618     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1619     if (etype == T_INT) {
1620       evprorvd(dst, src, shift, vector_len);
1621     } else {
1622       assert(etype == T_LONG, "expected type T_LONG");
1623       evprorvq(dst, src, shift, vector_len);
1624     }
1625   }
1626 }
1627 
1628 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1629   if (opcode == Op_RShiftVI) {
1630     psrad(dst, shift);
1631   } else if (opcode == Op_LShiftVI) {
1632     pslld(dst, shift);
1633   } else {
1634     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1635     psrld(dst, shift);
1636   }
1637 }
1638 
1639 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1640   switch (opcode) {
1641     case Op_RShiftVI:  psrad(dst, shift); break;
1642     case Op_LShiftVI:  pslld(dst, shift); break;
1643     case Op_URShiftVI: psrld(dst, shift); break;
1644 
1645     default: assert(false, "%s", NodeClassNames[opcode]);
1646   }
1647 }
1648 
1649 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1650   if (opcode == Op_RShiftVI) {
1651     vpsrad(dst, nds, shift, vector_len);
1652   } else if (opcode == Op_LShiftVI) {
1653     vpslld(dst, nds, shift, vector_len);
1654   } else {
1655     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1656     vpsrld(dst, nds, shift, vector_len);
1657   }
1658 }
1659 
1660 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1661   switch (opcode) {
1662     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1663     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1664     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1665 
1666     default: assert(false, "%s", NodeClassNames[opcode]);
1667   }
1668 }
1669 
1670 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1671   switch (opcode) {
1672     case Op_RShiftVB:  // fall-through
1673     case Op_RShiftVS:  psraw(dst, shift); break;
1674 
1675     case Op_LShiftVB:  // fall-through
1676     case Op_LShiftVS:  psllw(dst, shift);   break;
1677 
1678     case Op_URShiftVS: // fall-through
1679     case Op_URShiftVB: psrlw(dst, shift);  break;
1680 
1681     default: assert(false, "%s", NodeClassNames[opcode]);
1682   }
1683 }
1684 
1685 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1686   switch (opcode) {
1687     case Op_RShiftVB:  // fall-through
1688     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1689 
1690     case Op_LShiftVB:  // fall-through
1691     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1692 
1693     case Op_URShiftVS: // fall-through
1694     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1695 
1696     default: assert(false, "%s", NodeClassNames[opcode]);
1697   }
1698 }
1699 
1700 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1701   switch (opcode) {
1702     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1703     case Op_LShiftVL:  psllq(dst, shift); break;
1704     case Op_URShiftVL: psrlq(dst, shift); break;
1705 
1706     default: assert(false, "%s", NodeClassNames[opcode]);
1707   }
1708 }
1709 
1710 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1711   if (opcode == Op_RShiftVL) {
1712     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1713   } else if (opcode == Op_LShiftVL) {
1714     psllq(dst, shift);
1715   } else {
1716     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1717     psrlq(dst, shift);
1718   }
1719 }
1720 
1721 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1722   switch (opcode) {
1723     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1724     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1725     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1726 
1727     default: assert(false, "%s", NodeClassNames[opcode]);
1728   }
1729 }
1730 
1731 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1732   if (opcode == Op_RShiftVL) {
1733     evpsraq(dst, nds, shift, vector_len);
1734   } else if (opcode == Op_LShiftVL) {
1735     vpsllq(dst, nds, shift, vector_len);
1736   } else {
1737     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1738     vpsrlq(dst, nds, shift, vector_len);
1739   }
1740 }
1741 
1742 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1743   switch (opcode) {
1744     case Op_RShiftVB:  // fall-through
1745     case Op_RShiftVS:  // fall-through
1746     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1747 
1748     case Op_LShiftVB:  // fall-through
1749     case Op_LShiftVS:  // fall-through
1750     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1751 
1752     case Op_URShiftVB: // fall-through
1753     case Op_URShiftVS: // fall-through
1754     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1755 
1756     default: assert(false, "%s", NodeClassNames[opcode]);
1757   }
1758 }
1759 
1760 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1761   switch (opcode) {
1762     case Op_RShiftVB:  // fall-through
1763     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1764 
1765     case Op_LShiftVB:  // fall-through
1766     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1767 
1768     case Op_URShiftVB: // fall-through
1769     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1770 
1771     default: assert(false, "%s", NodeClassNames[opcode]);
1772   }
1773 }
1774 
1775 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1776   assert(UseAVX >= 2, "required");
1777   switch (opcode) {
1778     case Op_RShiftVL: {
1779       if (UseAVX > 2) {
1780         assert(tmp == xnoreg, "not used");
1781         if (!VM_Version::supports_avx512vl()) {
1782           vlen_enc = Assembler::AVX_512bit;
1783         }
1784         evpsravq(dst, src, shift, vlen_enc);
1785       } else {
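             // There is no 64-bit arithmetic right shift below AVX-512; emulate it with the identity
             // sra(x, s) == (srl(x, s) ^ srl(SIGN, s)) - srl(SIGN, s), where SIGN has only the sign
             // bit set in each lane (vector_long_sign_mask).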
1786         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1787         vpsrlvq(dst, src, shift, vlen_enc);
1788         vpsrlvq(tmp, tmp, shift, vlen_enc);
1789         vpxor(dst, dst, tmp, vlen_enc);
1790         vpsubq(dst, dst, tmp, vlen_enc);
1791       }
1792       break;
1793     }
1794     case Op_LShiftVL: {
1795       assert(tmp == xnoreg, "not used");
1796       vpsllvq(dst, src, shift, vlen_enc);
1797       break;
1798     }
1799     case Op_URShiftVL: {
1800       assert(tmp == xnoreg, "not used");
1801       vpsrlvq(dst, src, shift, vlen_enc);
1802       break;
1803     }
1804     default: assert(false, "%s", NodeClassNames[opcode]);
1805   }
1806 }
1807 
1808 // Variable shift src by shift using vtmp as a temporary, giving word result in dst
1809 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1810   assert(opcode == Op_LShiftVB ||
1811          opcode == Op_RShiftVB ||
1812          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1813   bool sign = (opcode != Op_URShiftVB);
1814   assert(vector_len == 0, "required");
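       // Widen the bytes and the shift counts to dwords, perform the variable dword shift, mask the
       // results back to byte range, then pack the two 128-bit halves of dwords down to words.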
1815   vextendbd(sign, dst, src, 1);
1816   vpmovzxbd(vtmp, shift, 1);
1817   varshiftd(opcode, dst, dst, vtmp, 1);
1818   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1819   vextracti128_high(vtmp, dst);
1820   vpackusdw(dst, dst, vtmp, 0);
1821 }
1822 
1823 // Variable shift src by shift using vtmp as a temporary, giving byte result in dst
1824 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1825   assert(opcode == Op_LShiftVB ||
1826          opcode == Op_RShiftVB ||
1827          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1828   bool sign = (opcode != Op_URShiftVB);
1829   int ext_vector_len = vector_len + 1;
1830   vextendbw(sign, dst, src, ext_vector_len);
1831   vpmovzxbw(vtmp, shift, ext_vector_len);
1832   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1833   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1834   if (vector_len == 0) {
1835     vextracti128_high(vtmp, dst);
1836     vpackuswb(dst, dst, vtmp, vector_len);
1837   } else {
1838     vextracti64x4_high(vtmp, dst);
1839     vpackuswb(dst, dst, vtmp, vector_len);
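         // vpackuswb packs within 128-bit lanes, leaving the two halves interleaved;
         // vpermq(0xD8) restores linear quadword order.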
1840     vpermq(dst, dst, 0xD8, vector_len);
1841   }
1842 }
1843 
1844 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1845   switch(typ) {
1846     case T_BYTE:
1847       pinsrb(dst, val, idx);
1848       break;
1849     case T_SHORT:
1850       pinsrw(dst, val, idx);
1851       break;
1852     case T_INT:
1853       pinsrd(dst, val, idx);
1854       break;
1855     case T_LONG:
1856       pinsrq(dst, val, idx);
1857       break;
1858     default:
1859       assert(false,"Should not reach here.");
1860       break;
1861   }
1862 }
1863 
1864 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1865   switch(typ) {
1866     case T_BYTE:
1867       vpinsrb(dst, src, val, idx);
1868       break;
1869     case T_SHORT:
1870       vpinsrw(dst, src, val, idx);
1871       break;
1872     case T_INT:
1873       vpinsrd(dst, src, val, idx);
1874       break;
1875     case T_LONG:
1876       vpinsrq(dst, src, val, idx);
1877       break;
1878     default:
1879       assert(false,"Should not reach here.");
1880       break;
1881   }
1882 }
1883 
1884 #ifdef _LP64
1885 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1886                                                 XMMRegister dst, Register base,
1887                                                 Register idx_base,
1888                                                 Register offset, Register mask,
1889                                                 Register mask_idx, Register rtmp,
1890                                                 int vlen_enc) {
1891   vpxor(dst, dst, dst, vlen_enc);
1892   if (elem_bt == T_SHORT) {
1893     for (int i = 0; i < 4; i++) {
1894       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1895       Label skip_load;
1896       btq(mask, mask_idx);
1897       jccb(Assembler::carryClear, skip_load);
1898       movl(rtmp, Address(idx_base, i * 4));
1899       if (offset != noreg) {
1900         addl(rtmp, offset);
1901       }
1902       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1903       bind(skip_load);
1904       incq(mask_idx);
1905     }
1906   } else {
1907     assert(elem_bt == T_BYTE, "");
1908     for (int i = 0; i < 8; i++) {
1909       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1910       Label skip_load;
1911       btq(mask, mask_idx);
1912       jccb(Assembler::carryClear, skip_load);
1913       movl(rtmp, Address(idx_base, i * 4));
1914       if (offset != noreg) {
1915         addl(rtmp, offset);
1916       }
1917       pinsrb(dst, Address(base, rtmp), i);
1918       bind(skip_load);
1919       incq(mask_idx);
1920     }
1921   }
1922 }
1923 #endif // _LP64
1924 
1925 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1926                                          Register base, Register idx_base,
1927                                          Register offset, Register rtmp,
1928                                          int vlen_enc) {
1929   vpxor(dst, dst, dst, vlen_enc);
1930   if (elem_bt == T_SHORT) {
1931     for (int i = 0; i < 4; i++) {
1932       // dst[i] = src[offset + idx_base[i]]
1933       movl(rtmp, Address(idx_base, i * 4));
1934       if (offset != noreg) {
1935         addl(rtmp, offset);
1936       }
1937       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1938     }
1939   } else {
1940     assert(elem_bt == T_BYTE, "");
1941     for (int i = 0; i < 8; i++) {
1942       // dst[i] = src[offset + idx_base[i]]
1943       movl(rtmp, Address(idx_base, i * 4));
1944       if (offset != noreg) {
1945         addl(rtmp, offset);
1946       }
1947       pinsrb(dst, Address(base, rtmp), i);
1948     }
1949   }
1950 }
1951 
1952 /*
1953  * Gather using a hybrid algorithm: first partially unroll a scalar loop
1954  * to accumulate values from the gather indices into a quad-word (64-bit) slice.
1955  * A slice may hold 8 byte values or 4 short values. This is followed by a vector
1956  * permutation to place the slice into the appropriate vector lanes of the
1957  * destination vector. The following pseudocode describes the
1958  * algorithm in detail:
1959  *
1960  * DST_VEC = ZERO_VEC
1961  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1962  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1963  * FOREACH_ITER:
1964  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1965  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1966  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1967  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1968  *
1969  * With each iteration, the doubleword permute indices (0, 1) corresponding
1970  * to the gathered quadword are shifted right by two lane positions.
1971  *
1972  */
1973 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1974                                         Register base, Register idx_base,
1975                                         Register offset, Register mask,
1976                                         XMMRegister xtmp1, XMMRegister xtmp2,
1977                                         XMMRegister temp_dst, Register rtmp,
1978                                         Register mask_idx, Register length,
1979                                         int vector_len, int vlen_enc) {
1980   Label GATHER8_LOOP;
1981   assert(is_subword_type(elem_ty), "");
1982   movl(length, vector_len);
1983   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1984   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1985   vallones(xtmp2, vlen_enc);
1986   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1987   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1988   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1989 
1990   bind(GATHER8_LOOP);
1991     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1992     if (mask == noreg) {
1993       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1994     } else {
1995       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1996     }
1997     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1998     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1999     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
2000     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
2001     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
2002     vpor(dst, dst, temp_dst, vlen_enc);
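         // Advance to the next 64-bit slice: 8 byte indices or 4 short indices per iteration,
         // i.e. 32 or 16 bytes of int-sized gather indices.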
2003     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
2004     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
2005     jcc(Assembler::notEqual, GATHER8_LOOP);
2006 }
2007 
2008 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
2009   switch(typ) {
2010     case T_INT:
2011       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
2012       break;
2013     case T_FLOAT:
2014       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
2015       break;
2016     case T_LONG:
2017       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
2018       break;
2019     case T_DOUBLE:
2020       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
2021       break;
2022     default:
2023       assert(false,"Should not reach here.");
2024       break;
2025   }
2026 }
2027 
2028 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
2029   switch(typ) {
2030     case T_INT:
2031       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
2032       break;
2033     case T_FLOAT:
2034       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
2035       break;
2036     case T_LONG:
2037       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
2038       break;
2039     case T_DOUBLE:
2040       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
2041       break;
2042     default:
2043       assert(false,"Should not reach here.");
2044       break;
2045   }
2046 }
2047 
2048 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
2049   switch(typ) {
2050     case T_INT:
2051       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
2052       break;
2053     case T_FLOAT:
2054       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
2055       break;
2056     case T_LONG:
2057       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
2058       break;
2059     case T_DOUBLE:
2060       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
2061       break;
2062     default:
2063       assert(false,"Should not reach here.");
2064       break;
2065   }
2066 }
2067 
2068 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
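       // The incoming mask holds one byte per element (0 or 1); subtracting it from zero turns each 1
       // into 0xFF, and the sign extension below widens that all-zeros/all-ones byte to the element size.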
2069   if (vlen_in_bytes <= 16) {
2070     pxor (dst, dst);
2071     psubb(dst, src);
2072     switch (elem_bt) {
2073       case T_BYTE:   /* nothing to do */ break;
2074       case T_SHORT:  pmovsxbw(dst, dst); break;
2075       case T_INT:    pmovsxbd(dst, dst); break;
2076       case T_FLOAT:  pmovsxbd(dst, dst); break;
2077       case T_LONG:   pmovsxbq(dst, dst); break;
2078       case T_DOUBLE: pmovsxbq(dst, dst); break;
2079 
2080       default: assert(false, "%s", type2name(elem_bt));
2081     }
2082   } else {
2083     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
2084     int vlen_enc = vector_length_encoding(vlen_in_bytes);
2085 
2086     vpxor (dst, dst, dst, vlen_enc);
2087     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
2088 
2089     switch (elem_bt) {
2090       case T_BYTE:   /* nothing to do */            break;
2091       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
2092       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
2093       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
2094       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
2095       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
2096 
2097       default: assert(false, "%s", type2name(elem_bt));
2098     }
2099   }
2100 }
2101 
2102 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
2103   if (novlbwdq) {
2104     vpmovsxbd(xtmp, src, vlen_enc);
2105     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
2106             Assembler::eq, true, vlen_enc, noreg);
2107   } else {
2108     vpxor(xtmp, xtmp, xtmp, vlen_enc);
2109     vpsubb(xtmp, xtmp, src, vlen_enc);
2110     evpmovb2m(dst, xtmp, vlen_enc);
2111   }
2112 }
2113 
2114 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
2115   switch (vlen_in_bytes) {
2116     case 4:  movdl(dst, src);   break;
2117     case 8:  movq(dst, src);    break;
2118     case 16: movdqu(dst, src);  break;
2119     case 32: vmovdqu(dst, src); break;
2120     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
2121     default: ShouldNotReachHere();
2122   }
2123 }
2124 
2125 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
2126   assert(rscratch != noreg || always_reachable(src), "missing");
2127 
2128   if (reachable(src)) {
2129     load_vector(dst, as_Address(src), vlen_in_bytes);
2130   } else {
2131     lea(rscratch, src);
2132     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
2133   }
2134 }
2135 
2136 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
2137   int vlen_enc = vector_length_encoding(vlen);
2138   if (VM_Version::supports_avx()) {
2139     if (bt == T_LONG) {
2140       if (VM_Version::supports_avx2()) {
2141         vpbroadcastq(dst, src, vlen_enc);
2142       } else {
2143         vmovddup(dst, src, vlen_enc);
2144       }
2145     } else if (bt == T_DOUBLE) {
2146       if (vlen_enc != Assembler::AVX_128bit) {
2147         vbroadcastsd(dst, src, vlen_enc, noreg);
2148       } else {
2149         vmovddup(dst, src, vlen_enc);
2150       }
2151     } else {
2152       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
2153         vpbroadcastd(dst, src, vlen_enc);
2154       } else {
2155         vbroadcastss(dst, src, vlen_enc);
2156       }
2157     }
2158   } else if (VM_Version::supports_sse3()) {
2159     movddup(dst, src);
2160   } else {
2161     movq(dst, src);
2162     if (vlen == 16) {
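           // duplicate the low 64 bits to fill the 128-bit register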
2163       punpcklqdq(dst, dst);
2164     }
2165   }
2166 }
2167 
2168 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
2169   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
2170   int offset = exact_log2(type2aelembytes(bt)) << 6;
2171   if (is_floating_point_type(bt)) {
2172     offset += 128;
2173   }
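       // For example: T_SHORT -> (1 << 6) = 64, T_FLOAT -> (2 << 6) + 128 = 256, T_DOUBLE -> (3 << 6) + 128 = 320.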
2174   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
2175   load_vector(dst, addr, vlen_in_bytes);
2176 }
2177 
2178 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
2179 
2180 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
2181   int vector_len = Assembler::AVX_128bit;
2182 
2183   switch (opcode) {
2184     case Op_AndReductionV:  pand(dst, src); break;
2185     case Op_OrReductionV:   por (dst, src); break;
2186     case Op_XorReductionV:  pxor(dst, src); break;
2187     case Op_MinReductionV:
2188       switch (typ) {
2189         case T_BYTE:        pminsb(dst, src); break;
2190         case T_SHORT:       pminsw(dst, src); break;
2191         case T_INT:         pminsd(dst, src); break;
2192         case T_LONG:        assert(UseAVX > 2, "required");
2193                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
2194         default:            assert(false, "wrong type");
2195       }
2196       break;
2197     case Op_MaxReductionV:
2198       switch (typ) {
2199         case T_BYTE:        pmaxsb(dst, src); break;
2200         case T_SHORT:       pmaxsw(dst, src); break;
2201         case T_INT:         pmaxsd(dst, src); break;
2202         case T_LONG:        assert(UseAVX > 2, "required");
2203                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
2204         default:            assert(false, "wrong type");
2205       }
2206       break;
2207     case Op_AddReductionVF: addss(dst, src); break;
2208     case Op_AddReductionVD: addsd(dst, src); break;
2209     case Op_AddReductionVI:
2210       switch (typ) {
2211         case T_BYTE:        paddb(dst, src); break;
2212         case T_SHORT:       paddw(dst, src); break;
2213         case T_INT:         paddd(dst, src); break;
2214         default:            assert(false, "wrong type");
2215       }
2216       break;
2217     case Op_AddReductionVL: paddq(dst, src); break;
2218     case Op_MulReductionVF: mulss(dst, src); break;
2219     case Op_MulReductionVD: mulsd(dst, src); break;
2220     case Op_MulReductionVI:
2221       switch (typ) {
2222         case T_SHORT:       pmullw(dst, src); break;
2223         case T_INT:         pmulld(dst, src); break;
2224         default:            assert(false, "wrong type");
2225       }
2226       break;
2227     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2228                             evpmullq(dst, dst, src, vector_len); break;
2229     default:                assert(false, "wrong opcode");
2230   }
2231 }
2232 
2233 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2234   int vector_len = Assembler::AVX_256bit;
2235 
2236   switch (opcode) {
2237     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2238     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2239     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2240     case Op_MinReductionV:
2241       switch (typ) {
2242         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2243         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2244         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2245         case T_LONG:        assert(UseAVX > 2, "required");
2246                             vpminsq(dst, src1, src2, vector_len); break;
2247         default:            assert(false, "wrong type");
2248       }
2249       break;
2250     case Op_MaxReductionV:
2251       switch (typ) {
2252         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2253         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2254         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2255         case T_LONG:        assert(UseAVX > 2, "required");
2256                             vpmaxsq(dst, src1, src2, vector_len); break;
2257         default:            assert(false, "wrong type");
2258       }
2259       break;
2260     case Op_AddReductionVI:
2261       switch (typ) {
2262         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2263         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2264         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2265         default:            assert(false, "wrong type");
2266       }
2267       break;
2268     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2269     case Op_MulReductionVI:
2270       switch (typ) {
2271         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2272         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2273         default:            assert(false, "wrong type");
2274       }
2275       break;
2276     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2277     default:                assert(false, "wrong opcode");
2278   }
2279 }
2280 
2281 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2282                                   XMMRegister dst, XMMRegister src,
2283                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2284   switch (opcode) {
2285     case Op_AddReductionVF:
2286     case Op_MulReductionVF:
2287       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2288       break;
2289 
2290     case Op_AddReductionVD:
2291     case Op_MulReductionVD:
2292       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2293       break;
2294 
2295     default: assert(false, "wrong opcode");
2296   }
2297 }
2298 
2299 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2300                              Register dst, Register src1, XMMRegister src2,
2301                              XMMRegister vtmp1, XMMRegister vtmp2) {
2302   switch (vlen) {
2303     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2304     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2305     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2306     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2307 
2308     default: assert(false, "wrong vector length");
2309   }
2310 }
2311 
2312 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2313                              Register dst, Register src1, XMMRegister src2,
2314                              XMMRegister vtmp1, XMMRegister vtmp2) {
2315   switch (vlen) {
2316     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2317     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2318     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2319     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2320 
2321     default: assert(false, "wrong vector length");
2322   }
2323 }
2324 
2325 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2326                              Register dst, Register src1, XMMRegister src2,
2327                              XMMRegister vtmp1, XMMRegister vtmp2) {
2328   switch (vlen) {
2329     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2330     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2331     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2332     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2333 
2334     default: assert(false, "wrong vector length");
2335   }
2336 }
2337 
2338 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2339                              Register dst, Register src1, XMMRegister src2,
2340                              XMMRegister vtmp1, XMMRegister vtmp2) {
2341   switch (vlen) {
2342     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2343     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2344     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2345     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2346 
2347     default: assert(false, "wrong vector length");
2348   }
2349 }
2350 
2351 #ifdef _LP64
2352 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2353                              Register dst, Register src1, XMMRegister src2,
2354                              XMMRegister vtmp1, XMMRegister vtmp2) {
2355   switch (vlen) {
2356     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2357     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2358     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2359 
2360     default: assert(false, "wrong vector length");
2361   }
2362 }
2363 #endif // _LP64
2364 
2365 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2366   switch (vlen) {
2367     case 2:
2368       assert(vtmp2 == xnoreg, "");
2369       reduce2F(opcode, dst, src, vtmp1);
2370       break;
2371     case 4:
2372       assert(vtmp2 == xnoreg, "");
2373       reduce4F(opcode, dst, src, vtmp1);
2374       break;
2375     case 8:
2376       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2377       break;
2378     case 16:
2379       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2380       break;
2381     default: assert(false, "wrong vector length");
2382   }
2383 }
2384 
2385 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2386   switch (vlen) {
2387     case 2:
2388       assert(vtmp2 == xnoreg, "");
2389       reduce2D(opcode, dst, src, vtmp1);
2390       break;
2391     case 4:
2392       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2393       break;
2394     case 8:
2395       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2396       break;
2397     default: assert(false, "wrong vector length");
2398   }
2399 }
2400 
2401 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
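       // For add, a horizontal add (phaddd) folds the two lanes directly; otherwise shuffle lane 1
       // down next to lane 0 and apply the reduction op, then fold in the scalar src1.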
2402   if (opcode == Op_AddReductionVI) {
2403     if (vtmp1 != src2) {
2404       movdqu(vtmp1, src2);
2405     }
2406     phaddd(vtmp1, vtmp1);
2407   } else {
2408     pshufd(vtmp1, src2, 0x1);
2409     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2410   }
2411   movdl(vtmp2, src1);
2412   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2413   movdl(dst, vtmp1);
2414 }
2415 
2416 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2417   if (opcode == Op_AddReductionVI) {
2418     if (vtmp1 != src2) {
2419       movdqu(vtmp1, src2);
2420     }
2421     phaddd(vtmp1, src2);
2422     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2423   } else {
2424     pshufd(vtmp2, src2, 0xE);
2425     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2426     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2427   }
2428 }
2429 
2430 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2431   if (opcode == Op_AddReductionVI) {
2432     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2433     vextracti128_high(vtmp2, vtmp1);
2434     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2435     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2436   } else {
2437     vextracti128_high(vtmp1, src2);
2438     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2439     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2440   }
2441 }
2442 
2443 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2444   vextracti64x4_high(vtmp2, src2);
2445   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2446   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2447 }
2448 
2449 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2450   pshufd(vtmp2, src2, 0x1);
2451   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2452   movdqu(vtmp1, vtmp2);
2453   psrldq(vtmp1, 2);
2454   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2455   movdqu(vtmp2, vtmp1);
2456   psrldq(vtmp2, 1);
2457   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2458   movdl(vtmp2, src1);
2459   pmovsxbd(vtmp1, vtmp1);
2460   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2461   pextrb(dst, vtmp1, 0x0);
2462   movsbl(dst, dst);
2463 }
2464 
2465 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2466   pshufd(vtmp1, src2, 0xE);
2467   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2468   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2469 }
2470 
2471 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2472   vextracti128_high(vtmp2, src2);
2473   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2474   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2475 }
2476 
2477 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2478   vextracti64x4_high(vtmp1, src2);
2479   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2480   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2481 }
2482 
2483 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2484   pmovsxbw(vtmp2, src2);
2485   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2486 }
2487 
2488 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2489   if (UseAVX > 1) {
2490     int vector_len = Assembler::AVX_256bit;
2491     vpmovsxbw(vtmp1, src2, vector_len);
2492     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2493   } else {
2494     pmovsxbw(vtmp2, src2);
2495     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2496     pshufd(vtmp2, src2, 0xE);   // bring the high 8 bytes of src2 down to the low quadword
2497     pmovsxbw(vtmp2, vtmp2);     // sign-extend them to 8 shorts for the second reduction pass
2498     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2499   }
2500 }
2501 
2502 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2503   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2504     int vector_len = Assembler::AVX_512bit;
2505     vpmovsxbw(vtmp1, src2, vector_len);
2506     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2507   } else {
2508     assert(UseAVX >= 2,"Should not reach here.");
2509     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2510     vextracti128_high(vtmp2, src2);
2511     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2512   }
2513 }
2514 
2515 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2516   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2517   vextracti64x4_high(vtmp2, src2);
2518   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2519 }
2520 
2521 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2522   if (opcode == Op_AddReductionVI) {
2523     if (vtmp1 != src2) {
2524       movdqu(vtmp1, src2);
2525     }
2526     phaddw(vtmp1, vtmp1);
2527     phaddw(vtmp1, vtmp1);
2528   } else {
2529     pshufd(vtmp2, src2, 0x1);
2530     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2531     movdqu(vtmp1, vtmp2);
2532     psrldq(vtmp1, 2);
2533     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2534   }
2535   movdl(vtmp2, src1);
2536   pmovsxwd(vtmp1, vtmp1);
2537   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2538   pextrw(dst, vtmp1, 0x0);
2539   movswl(dst, dst);
2540 }
2541 
2542 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2543   if (opcode == Op_AddReductionVI) {
2544     if (vtmp1 != src2) {
2545       movdqu(vtmp1, src2);
2546     }
2547     phaddw(vtmp1, src2);
2548   } else {
2549     pshufd(vtmp1, src2, 0xE);
2550     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2551   }
2552   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2553 }
2554 
2555 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2556   if (opcode == Op_AddReductionVI) {
2557     int vector_len = Assembler::AVX_256bit;
2558     vphaddw(vtmp2, src2, src2, vector_len);
2559     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2560   } else {
2561     vextracti128_high(vtmp2, src2);
2562     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2563   }
2564   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2565 }
2566 
2567 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2568   int vector_len = Assembler::AVX_256bit;
2569   vextracti64x4_high(vtmp1, src2);
2570   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2571   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2572 }
2573 
2574 #ifdef _LP64
2575 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2576   pshufd(vtmp2, src2, 0xE);
2577   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2578   movdq(vtmp1, src1);
2579   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2580   movdq(dst, vtmp1);
2581 }
2582 
2583 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2584   vextracti128_high(vtmp1, src2);
2585   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2586   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2587 }
2588 
2589 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2590   vextracti64x4_high(vtmp2, src2);
2591   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2592   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2593 }
2594 
2595 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
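       // Build a mask with the low 'len' bits set: BZHI clears all bits of -1 at positions >= len.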
2596   mov64(temp, -1L);
2597   bzhiq(temp, temp, len);
2598   kmovql(dst, temp);
2599 }
2600 #endif // _LP64
2601 
2602 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2603   reduce_operation_128(T_FLOAT, opcode, dst, src);
2604   pshufd(vtmp, src, 0x1);
2605   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2606 }
2607 
2608 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2609   reduce2F(opcode, dst, src, vtmp);
2610   pshufd(vtmp, src, 0x2);
2611   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2612   pshufd(vtmp, src, 0x3);
2613   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2614 }
2615 
2616 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2617   reduce4F(opcode, dst, src, vtmp2);
2618   vextractf128_high(vtmp2, src);
2619   reduce4F(opcode, dst, vtmp2, vtmp1);
2620 }
2621 
2622 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2623   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2624   vextracti64x4_high(vtmp1, src);
2625   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2626 }
2627 
2628 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2629   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2630   pshufd(vtmp, src, 0xE);
2631   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2632 }
2633 
2634 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2635   reduce2D(opcode, dst, src, vtmp2);
2636   vextractf128_high(vtmp2, src);
2637   reduce2D(opcode, dst, vtmp2, vtmp1);
2638 }
2639 
2640 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2641   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2642   vextracti64x4_high(vtmp1, src);
2643   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2644 }
2645 
2646 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2647   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2648 }
2649 
2650 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2651   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2652 }
2653 
2654 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2655                                  int vec_enc) {
2656   switch(elem_bt) {
2657     case T_INT:
2658     case T_FLOAT:
2659       vmaskmovps(dst, src, mask, vec_enc);
2660       break;
2661     case T_LONG:
2662     case T_DOUBLE:
2663       vmaskmovpd(dst, src, mask, vec_enc);
2664       break;
2665     default:
2666       fatal("Unsupported type %s", type2name(elem_bt));
2667       break;
2668   }
2669 }
2670 
2671 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2672                                  int vec_enc) {
2673   switch(elem_bt) {
2674     case T_INT:
2675     case T_FLOAT:
2676       vmaskmovps(dst, src, mask, vec_enc);
2677       break;
2678     case T_LONG:
2679     case T_DOUBLE:
2680       vmaskmovpd(dst, src, mask, vec_enc);
2681       break;
2682     default:
2683       fatal("Unsupported type %s", type2name(elem_bt));
2684       break;
2685   }
2686 }
2687 
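     // Min/max reduction over a vector of vlen floats. Each iteration halves the number
     // of live elements: the wide steps extract the upper half (i == 3: upper 256 bits,
     // i == 2: upper 128 bits), the last two steps use in-lane permutes (imm 14 moves
     // the upper float pair down, imm 1 moves element 1 down). vminmax_fp does the
     // element-wise combine so Java's Math.min/max NaN and -0.0 rules are preserved;
     // when is_dst_valid, the value already accumulated in dst is folded in at the end.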
2688 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2689                                           XMMRegister dst, XMMRegister src,
2690                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2691                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2692   const int permconst[] = {1, 14};
2693   XMMRegister wsrc = src;
2694   XMMRegister wdst = xmm_0;
2695   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2696 
2697   int vlen_enc = Assembler::AVX_128bit;
2698   if (vlen == 16) {
2699     vlen_enc = Assembler::AVX_256bit;
2700   }
2701 
2702   for (int i = log2(vlen) - 1; i >=0; i--) {
2703     if (i == 0 && !is_dst_valid) {
2704       wdst = dst;
2705     }
2706     if (i == 3) {
2707       vextracti64x4_high(wtmp, wsrc);
2708     } else if (i == 2) {
2709       vextracti128_high(wtmp, wsrc);
2710     } else { // i = [0,1]
2711       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2712     }
2713     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2714     wsrc = wdst;
2715     vlen_enc = Assembler::AVX_128bit;
2716   }
2717   if (is_dst_valid) {
2718     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2719   }
2720 }
2721 
2722 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2723                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2724                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2725   XMMRegister wsrc = src;
2726   XMMRegister wdst = xmm_0;
2727   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2728   int vlen_enc = Assembler::AVX_128bit;
2729   if (vlen == 8) {
2730     vlen_enc = Assembler::AVX_256bit;
2731   }
2732   for (int i = log2(vlen) - 1; i >=0; i--) {
2733     if (i == 0 && !is_dst_valid) {
2734       wdst = dst;
2735     }
2736     if (i == 1) {
2737       vextracti128_high(wtmp, wsrc);
2738     } else if (i == 2) {
2739       vextracti64x4_high(wtmp, wsrc);
2740     } else {
2741       assert(i == 0, "%d", i);
2742       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2743     }
2744     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2745     wsrc = wdst;
2746     vlen_enc = Assembler::AVX_128bit;
2747   }
2748   if (is_dst_valid) {
2749     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2750   }
2751 }
2752 
2753 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2754   switch (bt) {
2755     case T_BYTE:  pextrb(dst, src, idx); break;
2756     case T_SHORT: pextrw(dst, src, idx); break;
2757     case T_INT:   pextrd(dst, src, idx); break;
2758     case T_LONG:  pextrq(dst, src, idx); break;
2759 
2760     default:
2761       assert(false,"Should not reach here.");
2762       break;
2763   }
2764 }
2765 
2766 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2767   int esize =  type2aelembytes(typ);
2768   int elem_per_lane = 16/esize;
2769   int lane = elemindex / elem_per_lane;
2770   int eindex = elemindex % elem_per_lane;
2771 
2772   if (lane >= 2) {
2773     assert(UseAVX > 2, "required");
2774     vextractf32x4(dst, src, lane & 3);
2775     return dst;
2776   } else if (lane > 0) {
2777     assert(UseAVX > 0, "required");
2778     vextractf128(dst, src, lane);
2779     return dst;
2780   } else {
2781     return src;
2782   }
2783 }
2784 
2785 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2786   if (typ == T_BYTE) {
2787     movsbl(dst, dst);
2788   } else if (typ == T_SHORT) {
2789     movswl(dst, dst);
2790   }
2791 }
2792 
2793 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2794   int esize =  type2aelembytes(typ);
2795   int elem_per_lane = 16/esize;
2796   int eindex = elemindex % elem_per_lane;
2797   assert(is_integral_type(typ),"required");
2798 
2799   if (eindex == 0) {
2800     if (typ == T_LONG) {
2801       movq(dst, src);
2802     } else {
2803       movdl(dst, src);
2804       movsxl(typ, dst);
2805     }
2806   } else {
2807     extract(typ, dst, src, eindex);
2808     movsxl(typ, dst);
2809   }
2810 }
2811 
2812 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2813   int esize =  type2aelembytes(typ);
2814   int elem_per_lane = 16/esize;
2815   int eindex = elemindex % elem_per_lane;
2816   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2817 
2818   if (eindex == 0) {
2819     movq(dst, src);
2820   } else {
2821     if (typ == T_FLOAT) {
2822       if (UseAVX == 0) {
2823         movdqu(dst, src);
2824         shufps(dst, dst, eindex);
2825       } else {
2826         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2827       }
2828     } else {
2829       if (UseAVX == 0) {
2830         movdqu(dst, src);
2831         psrldq(dst, eindex*esize);
2832       } else {
2833         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2834       }
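           // movq with a register source copies the low 64 bits and zeroes bits 127:64,
           // leaving only the selected double in the low lane.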
2835       movq(dst, dst);
2836     }
2837   }
2838   // Zero upper bits
2839   if (typ == T_FLOAT) {
2840     if (UseAVX == 0) {
2841       assert(vtmp != xnoreg, "required.");
2842       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2843       pand(dst, vtmp);
2844     } else {
2845       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2846     }
2847   }
2848 }
2849 
2850 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2851   switch(typ) {
2852     case T_BYTE:
2853     case T_BOOLEAN:
2854       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2855       break;
2856     case T_SHORT:
2857     case T_CHAR:
2858       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2859       break;
2860     case T_INT:
2861     case T_FLOAT:
2862       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2863       break;
2864     case T_LONG:
2865     case T_DOUBLE:
2866       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2867       break;
2868     default:
2869       assert(false,"Should not reach here.");
2870       break;
2871   }
2872 }
2873 
2874 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2875   assert(rscratch != noreg || always_reachable(src2), "missing");
2876 
2877   switch(typ) {
2878     case T_BOOLEAN:
2879     case T_BYTE:
2880       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2881       break;
2882     case T_CHAR:
2883     case T_SHORT:
2884       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2885       break;
2886     case T_INT:
2887     case T_FLOAT:
2888       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2889       break;
2890     case T_LONG:
2891     case T_DOUBLE:
2892       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2893       break;
2894     default:
2895       assert(false,"Should not reach here.");
2896       break;
2897   }
2898 }
2899 
2900 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2901   switch(typ) {
2902     case T_BYTE:
2903       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2904       break;
2905     case T_SHORT:
2906       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2907       break;
2908     case T_INT:
2909     case T_FLOAT:
2910       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2911       break;
2912     case T_LONG:
2913     case T_DOUBLE:
2914       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2915       break;
2916     default:
2917       assert(false,"Should not reach here.");
2918       break;
2919   }
2920 }
2921 
2922 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2923   assert(vlen_in_bytes <= 32, "");
2924   int esize = type2aelembytes(bt);
2925   if (vlen_in_bytes == 32) {
2926     assert(vtmp == xnoreg, "required.");
2927     if (esize >= 4) {
2928       vtestps(src1, src2, AVX_256bit);
2929     } else {
2930       vptest(src1, src2, AVX_256bit);
2931     }
2932     return;
2933   }
2934   if (vlen_in_bytes < 16) {
2935     // Duplicate the lower part to fill the whole register;
2936     // no need to do so for src2.
2937     assert(vtmp != xnoreg, "required");
2938     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2939     pshufd(vtmp, src1, shuffle_imm);
2940   } else {
2941     assert(vtmp == xnoreg, "required");
2942     vtmp = src1;
2943   }
2944   if (esize >= 4 && VM_Version::supports_avx()) {
2945     vtestps(vtmp, src2, AVX_128bit);
2946   } else {
2947     ptest(vtmp, src2);
2948   }
2949 }
2950 
2951 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2952   assert(UseAVX >= 2, "required");
2953 #ifdef ASSERT
2954   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2955   bool is_bw_supported = VM_Version::supports_avx512bw();
2956   if (is_bw && !is_bw_supported) {
2957     assert(vlen_enc != Assembler::AVX_512bit, "required");
2958     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2959            "XMM register should be 0-15");
2960   }
2961 #endif // ASSERT
2962   switch (elem_bt) {
2963     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2964     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2965     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2966     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2967     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2968     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2969     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2970   }
2971 }
2972 
2973 #ifdef _LP64
2974 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2975   assert(UseAVX >= 2, "required");
2976   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2977   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2978   if ((UseAVX > 2) &&
2979       (!is_bw || VM_Version::supports_avx512bw()) &&
2980       (!is_vl || VM_Version::supports_avx512vl())) {
2981     switch (elem_bt) {
2982       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2983       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2984       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2985       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2986       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2987     }
2988   } else {
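         // The GPR-source broadcast forms (evpbroadcast* from a register) require
         // AVX-512, so on the AVX2 path move the value into the low element of dst
         // first and broadcast from the XMM register.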
2989     assert(vlen_enc != Assembler::AVX_512bit, "required");
2990     assert((dst->encoding() < 16),"XMM register should be 0-15");
2991     switch (elem_bt) {
2992       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2993       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2994       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2995       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2996       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2997       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2998       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2999     }
3000   }
3001 }
3002 #endif
3003 
3004 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
3005   switch (to_elem_bt) {
3006     case T_SHORT:
3007       vpmovsxbw(dst, src, vlen_enc);
3008       break;
3009     case T_INT:
3010       vpmovsxbd(dst, src, vlen_enc);
3011       break;
3012     case T_FLOAT:
3013       vpmovsxbd(dst, src, vlen_enc);
3014       vcvtdq2ps(dst, dst, vlen_enc);
3015       break;
3016     case T_LONG:
3017       vpmovsxbq(dst, src, vlen_enc);
3018       break;
3019     case T_DOUBLE: {
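           // Sign-extend the bytes to ints at half the final vector width (a 512-bit
           // double result takes its 8 ints from a 256-bit register), then widen the
           // ints to doubles.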
3020       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
3021       vpmovsxbd(dst, src, mid_vlen_enc);
3022       vcvtdq2pd(dst, dst, vlen_enc);
3023       break;
3024     }
3025     default:
3026       fatal("Unsupported type %s", type2name(to_elem_bt));
3027       break;
3028   }
3029 }
3030 
3031 //-------------------------------------------------------------------------------------------
3032 
3033 // IndexOf for constant substrings with size >= 8 chars
3034 // which don't need to be loaded through stack.
3035 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
3036                                          Register cnt1, Register cnt2,
3037                                          int int_cnt2,  Register result,
3038                                          XMMRegister vec, Register tmp,
3039                                          int ae) {
3040   ShortBranchVerifier sbv(this);
3041   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3042   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3043 
3044   // This method uses the pcmpestri instruction with bound registers
3045   //   inputs:
3046   //     xmm - substring
3047   //     rax - substring length (elements count)
3048   //     mem - scanned string
3049   //     rdx - string length (elements count)
3050   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3051   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3052   //   outputs:
3053   //     rcx - matched index in string
3054   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3055   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3056   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3057   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3058   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3059 
3060   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
3061         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
3062         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
3063 
3064   // Note, inline_string_indexOf() generates checks:
3065   // if (substr.count > string.count) return -1;
3066   // if (substr.count == 0) return 0;
3067   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
3068 
3069   // Load substring.
3070   if (ae == StrIntrinsicNode::UL) {
3071     pmovzxbw(vec, Address(str2, 0));
3072   } else {
3073     movdqu(vec, Address(str2, 0));
3074   }
3075   movl(cnt2, int_cnt2);
3076   movptr(result, str1); // string addr
3077 
3078   if (int_cnt2 > stride) {
3079     jmpb(SCAN_TO_SUBSTR);
3080 
3081     // Reload substr for rescan; this code
3082     // is executed only for large substrings (> 8 chars).
3083     bind(RELOAD_SUBSTR);
3084     if (ae == StrIntrinsicNode::UL) {
3085       pmovzxbw(vec, Address(str2, 0));
3086     } else {
3087       movdqu(vec, Address(str2, 0));
3088     }
3089     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
3090 
3091     bind(RELOAD_STR);
3092     // We came here after the beginning of the substring was
3093     // matched but the rest of it was not, so we need to search
3094     // again. Start from the next element after the previous match.
3095 
3096     // cnt2 is the number of remaining substring elements and
3097     // cnt1 is the number of remaining string elements when the cmp failed.
3098     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
3099     subl(cnt1, cnt2);
3100     addl(cnt1, int_cnt2);
3101     movl(cnt2, int_cnt2); // Now restore cnt2
3102 
3103     decrementl(cnt1);     // Shift to next element
3104     cmpl(cnt1, cnt2);
3105     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3106 
3107     addptr(result, (1<<scale1));
3108 
3109   } // (int_cnt2 > 8)
3110 
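       // pcmpestri in equal-ordered mode sets CF when some position in the scanned
       // 16 bytes matches the head of the substring (rcx then holds the lowest such
       // index) and sets OF when the match starts at position 0, i.e. the whole
       // loaded vector matched.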
3111   // Scan string for start of substr in 16-byte vectors
3112   bind(SCAN_TO_SUBSTR);
3113   pcmpestri(vec, Address(result, 0), mode);
3114   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3115   subl(cnt1, stride);
3116   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3117   cmpl(cnt1, cnt2);
3118   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3119   addptr(result, 16);
3120   jmpb(SCAN_TO_SUBSTR);
3121 
3122   // Found a potential substr
3123   bind(FOUND_CANDIDATE);
3124   // Matched whole vector if first element matched (tmp(rcx) == 0).
3125   if (int_cnt2 == stride) {
3126     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
3127   } else { // int_cnt2 > 8
3128     jccb(Assembler::overflow, FOUND_SUBSTR);
3129   }
3130   // After pcmpestri tmp(rcx) contains matched element index
3131   // Compute start addr of substr
3132   lea(result, Address(result, tmp, scale1));
3133 
3134   // Make sure string is still long enough
3135   subl(cnt1, tmp);
3136   cmpl(cnt1, cnt2);
3137   if (int_cnt2 == stride) {
3138     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3139   } else { // int_cnt2 > 8
3140     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
3141   }
3142   // Left less than substring.
3143 
3144   bind(RET_NOT_FOUND);
3145   movl(result, -1);
3146   jmp(EXIT);
3147 
3148   if (int_cnt2 > stride) {
3149     // This code is optimized for the case when the whole substring
3150     // is matched whenever its head is matched.
3151     bind(MATCH_SUBSTR_HEAD);
3152     pcmpestri(vec, Address(result, 0), mode);
3153     // Reload only the string if it does not match
3154     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
3155 
3156     Label CONT_SCAN_SUBSTR;
3157     // Compare the rest of substring (> 8 chars).
3158     bind(FOUND_SUBSTR);
3159     // First 8 chars are already matched.
3160     negptr(cnt2);
3161     addptr(cnt2, stride);
3162 
3163     bind(SCAN_SUBSTR);
3164     subl(cnt1, stride);
3165     cmpl(cnt2, -stride); // Do not read beyond substring
3166     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
3167     // Back up strings to avoid reading beyond the substring:
3168     // cnt1 = cnt1 - cnt2 + 8
3169     addl(cnt1, cnt2); // cnt2 is negative
3170     addl(cnt1, stride);
3171     movl(cnt2, stride); negptr(cnt2);
3172     bind(CONT_SCAN_SUBSTR);
3173     if (int_cnt2 < (int)G) {
3174       int tail_off1 = int_cnt2<<scale1;
3175       int tail_off2 = int_cnt2<<scale2;
3176       if (ae == StrIntrinsicNode::UL) {
3177         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
3178       } else {
3179         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3180       }
3181       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3182     } else {
3183       // calculate index in register to avoid integer overflow (int_cnt2*2)
3184       movl(tmp, int_cnt2);
3185       addptr(tmp, cnt2);
3186       if (ae == StrIntrinsicNode::UL) {
3187         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3188       } else {
3189         movdqu(vec, Address(str2, tmp, scale2, 0));
3190       }
3191       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3192     }
3193     // Need to reload string pointers if we did not match the whole vector
3194     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3195     addptr(cnt2, stride);
3196     jcc(Assembler::negative, SCAN_SUBSTR);
3197     // Fall through if found full substring
3198 
3199   } // (int_cnt2 > 8)
3200 
3201   bind(RET_FOUND);
3202   // Found result if we matched full small substring.
3203   // Compute substr offset
3204   subptr(result, str1);
3205   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3206     shrl(result, 1); // index
3207   }
3208   bind(EXIT);
3209 
3210 } // string_indexofC8
3211 
3212 // Small strings are loaded through the stack if they cross a page boundary.
3213 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3214                                        Register cnt1, Register cnt2,
3215                                        int int_cnt2,  Register result,
3216                                        XMMRegister vec, Register tmp,
3217                                        int ae) {
3218   ShortBranchVerifier sbv(this);
3219   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3220   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3221 
3222   //
3223   // int_cnt2 is length of small (< 8 chars) constant substring
3224   // or (-1) for non constant substring in which case its length
3225   // is in cnt2 register.
3226   //
3227   // Note, inline_string_indexOf() generates checks:
3228   // if (substr.count > string.count) return -1;
3229   // if (substr.count == 0) return 0;
3230   //
3231   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3232   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3233   // This method uses the pcmpestri instruction with bound registers
3234   //   inputs:
3235   //     xmm - substring
3236   //     rax - substring length (elements count)
3237   //     mem - scanned string
3238   //     rdx - string length (elements count)
3239   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3240   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3241   //   outputs:
3242   //     rcx - matched index in string
3243   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3244   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3245   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3246   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3247 
3248   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3249         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3250         FOUND_CANDIDATE;
3251 
3252   { //========================================================
3253     // We don't know where these strings are located
3254     // and we can't read beyond them. Load them through the stack.
3255     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3256 
3257     movptr(tmp, rsp); // save old SP
3258 
3259     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3260       if (int_cnt2 == (1>>scale2)) { // One byte
3261         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3262         load_unsigned_byte(result, Address(str2, 0));
3263         movdl(vec, result); // move 32 bits
3264       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3265         // Not enough header space in 32-bit VM: 12+3 = 15.
3266         movl(result, Address(str2, -1));
3267         shrl(result, 8);
3268         movdl(vec, result); // move 32 bits
3269       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3270         load_unsigned_short(result, Address(str2, 0));
3271         movdl(vec, result); // move 32 bits
3272       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3273         movdl(vec, Address(str2, 0)); // move 32 bits
3274       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3275         movq(vec, Address(str2, 0));  // move 64 bits
3276       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3277         // Array header size is 12 bytes in 32-bit VM
3278         // + 6 bytes for 3 chars == 18 bytes,
3279         // enough space to load vec and shift.
3280         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3281         if (ae == StrIntrinsicNode::UL) {
3282           int tail_off = int_cnt2-8;
3283           pmovzxbw(vec, Address(str2, tail_off));
3284           psrldq(vec, -2*tail_off);
3285         }
3286         else {
3287           int tail_off = int_cnt2*(1<<scale2);
3288           movdqu(vec, Address(str2, tail_off-16));
3289           psrldq(vec, 16-tail_off);
3290         }
3291       }
3292     } else { // not constant substring
3293       cmpl(cnt2, stride);
3294       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3295 
3296       // We can read beyond the string if str2+16 does not cross a page boundary,
3297       // since heaps are aligned and mapped by pages.
3298       assert(os::vm_page_size() < (int)G, "default page should be small");
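           // e.g. with 4K pages: (str2 & 0xfff) > 0xff0 means a 16-byte load starting
           // at str2 could touch the next page.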
3299       movl(result, str2); // We need only low 32 bits
3300       andl(result, ((int)os::vm_page_size()-1));
3301       cmpl(result, ((int)os::vm_page_size()-16));
3302       jccb(Assembler::belowEqual, CHECK_STR);
3303 
3304       // Move small strings to the stack to allow loading 16 bytes into vec.
3305       subptr(rsp, 16);
3306       int stk_offset = wordSize-(1<<scale2);
3307       push(cnt2);
3308 
3309       bind(COPY_SUBSTR);
3310       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3311         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3312         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3313       } else if (ae == StrIntrinsicNode::UU) {
3314         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3315         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3316       }
3317       decrement(cnt2);
3318       jccb(Assembler::notZero, COPY_SUBSTR);
3319 
3320       pop(cnt2);
3321       movptr(str2, rsp);  // New substring address
3322     } // non constant
3323 
3324     bind(CHECK_STR);
3325     cmpl(cnt1, stride);
3326     jccb(Assembler::aboveEqual, BIG_STRINGS);
3327 
3328     // Check cross page boundary.
3329     movl(result, str1); // We need only low 32 bits
3330     andl(result, ((int)os::vm_page_size()-1));
3331     cmpl(result, ((int)os::vm_page_size()-16));
3332     jccb(Assembler::belowEqual, BIG_STRINGS);
3333 
3334     subptr(rsp, 16);
3335     int stk_offset = -(1<<scale1);
3336     if (int_cnt2 < 0) { // not constant
3337       push(cnt2);
3338       stk_offset += wordSize;
3339     }
3340     movl(cnt2, cnt1);
3341 
3342     bind(COPY_STR);
3343     if (ae == StrIntrinsicNode::LL) {
3344       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3345       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3346     } else {
3347       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3348       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3349     }
3350     decrement(cnt2);
3351     jccb(Assembler::notZero, COPY_STR);
3352 
3353     if (int_cnt2 < 0) { // not constant
3354       pop(cnt2);
3355     }
3356     movptr(str1, rsp);  // New string address
3357 
3358     bind(BIG_STRINGS);
3359     // Load substring.
3360     if (int_cnt2 < 0) { // -1
3361       if (ae == StrIntrinsicNode::UL) {
3362         pmovzxbw(vec, Address(str2, 0));
3363       } else {
3364         movdqu(vec, Address(str2, 0));
3365       }
3366       push(cnt2);       // substr count
3367       push(str2);       // substr addr
3368       push(str1);       // string addr
3369     } else {
3370       // Small (< 8 chars) constant substrings are loaded already.
3371       movl(cnt2, int_cnt2);
3372     }
3373     push(tmp);  // original SP
3374 
3375   } // Finished loading
3376 
3377   //========================================================
3378   // Start search
3379   //
3380 
3381   movptr(result, str1); // string addr
3382 
3383   if (int_cnt2  < 0) {  // Only for non constant substring
3384     jmpb(SCAN_TO_SUBSTR);
3385 
3386     // SP saved at sp+0
3387     // String saved at sp+1*wordSize
3388     // Substr saved at sp+2*wordSize
3389     // Substr count saved at sp+3*wordSize
3390 
3391     // Reload substr for rescan; this code
3392     // is executed only for large substrings (> 8 chars).
3393     bind(RELOAD_SUBSTR);
3394     movptr(str2, Address(rsp, 2*wordSize));
3395     movl(cnt2, Address(rsp, 3*wordSize));
3396     if (ae == StrIntrinsicNode::UL) {
3397       pmovzxbw(vec, Address(str2, 0));
3398     } else {
3399       movdqu(vec, Address(str2, 0));
3400     }
3401     // We came here after the beginning of the substring was
3402     // matched but the rest of it was not, so we need to search
3403     // again. Start from the next element after the previous match.
3404     subptr(str1, result); // Restore counter
3405     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3406       shrl(str1, 1);
3407     }
3408     addl(cnt1, str1);
3409     decrementl(cnt1);   // Shift to next element
3410     cmpl(cnt1, cnt2);
3411     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3412 
3413     addptr(result, (1<<scale1));
3414   } // non constant
3415 
3416   // Scan string for start of substr in 16-byte vectors
3417   bind(SCAN_TO_SUBSTR);
3418   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3419   pcmpestri(vec, Address(result, 0), mode);
3420   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3421   subl(cnt1, stride);
3422   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3423   cmpl(cnt1, cnt2);
3424   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3425   addptr(result, 16);
3426 
3427   bind(ADJUST_STR);
3428   cmpl(cnt1, stride); // Do not read beyond string
3429   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3430   // Back up the string to avoid reading beyond it.
3431   lea(result, Address(result, cnt1, scale1, -16));
3432   movl(cnt1, stride);
3433   jmpb(SCAN_TO_SUBSTR);
3434 
3435   // Found a potential substr
3436   bind(FOUND_CANDIDATE);
3437   // After pcmpestri tmp(rcx) contains matched element index
3438 
3439   // Make sure string is still long enough
3440   subl(cnt1, tmp);
3441   cmpl(cnt1, cnt2);
3442   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3443   // Left less than substring.
3444 
3445   bind(RET_NOT_FOUND);
3446   movl(result, -1);
3447   jmp(CLEANUP);
3448 
3449   bind(FOUND_SUBSTR);
3450   // Compute start addr of substr
3451   lea(result, Address(result, tmp, scale1));
3452   if (int_cnt2 > 0) { // Constant substring
3453     // Repeat search for small substring (< 8 chars)
3454     // from new point without reloading substring.
3455     // Have to check that we don't read beyond string.
3456     cmpl(tmp, stride-int_cnt2);
3457     jccb(Assembler::greater, ADJUST_STR);
3458     // Fall through if matched whole substring.
3459   } else { // non constant
3460     assert(int_cnt2 == -1, "should be != 0");
3461 
3462     addl(tmp, cnt2);
3463     // Found result if we matched whole substring.
3464     cmpl(tmp, stride);
3465     jcc(Assembler::lessEqual, RET_FOUND);
3466 
3467     // Repeat search for small substring (<= 8 chars)
3468     // from new point 'str1' without reloading substring.
3469     cmpl(cnt2, stride);
3470     // Have to check that we don't read beyond string.
3471     jccb(Assembler::lessEqual, ADJUST_STR);
3472 
3473     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3474     // Compare the rest of substring (> 8 chars).
3475     movptr(str1, result);
3476 
3477     cmpl(tmp, cnt2);
3478     // First 8 chars are already matched.
3479     jccb(Assembler::equal, CHECK_NEXT);
3480 
3481     bind(SCAN_SUBSTR);
3482     pcmpestri(vec, Address(str1, 0), mode);
3483     // Need to reload string pointers if we did not match the whole vector
3484     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3485 
3486     bind(CHECK_NEXT);
3487     subl(cnt2, stride);
3488     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3489     addptr(str1, 16);
3490     if (ae == StrIntrinsicNode::UL) {
3491       addptr(str2, 8);
3492     } else {
3493       addptr(str2, 16);
3494     }
3495     subl(cnt1, stride);
3496     cmpl(cnt2, stride); // Do not read beyond substring
3497     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3498     // Back up strings to avoid reading beyond the substring.
3499 
3500     if (ae == StrIntrinsicNode::UL) {
3501       lea(str2, Address(str2, cnt2, scale2, -8));
3502       lea(str1, Address(str1, cnt2, scale1, -16));
3503     } else {
3504       lea(str2, Address(str2, cnt2, scale2, -16));
3505       lea(str1, Address(str1, cnt2, scale1, -16));
3506     }
3507     subl(cnt1, cnt2);
3508     movl(cnt2, stride);
3509     addl(cnt1, stride);
3510     bind(CONT_SCAN_SUBSTR);
3511     if (ae == StrIntrinsicNode::UL) {
3512       pmovzxbw(vec, Address(str2, 0));
3513     } else {
3514       movdqu(vec, Address(str2, 0));
3515     }
3516     jmp(SCAN_SUBSTR);
3517 
3518     bind(RET_FOUND_LONG);
3519     movptr(str1, Address(rsp, wordSize));
3520   } // non constant
3521 
3522   bind(RET_FOUND);
3523   // Compute substr offset
3524   subptr(result, str1);
3525   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3526     shrl(result, 1); // index
3527   }
3528   bind(CLEANUP);
3529   pop(rsp); // restore SP
3530 
3531 } // string_indexof
3532 
3533 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3534                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3535   ShortBranchVerifier sbv(this);
3536   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3537 
3538   int stride = 8;
3539 
3540   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3541         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3542         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3543         FOUND_SEQ_CHAR, DONE_LABEL;
3544 
3545   movptr(result, str1);
3546   if (UseAVX >= 2) {
3547     cmpl(cnt1, stride);
3548     jcc(Assembler::less, SCAN_TO_CHAR);
3549     cmpl(cnt1, 2*stride);
3550     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3551     movdl(vec1, ch);
3552     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3553     vpxor(vec2, vec2);
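         // vec2 stays zero: vptest/ptest(vec2, vec3) sets CF iff vec3 is all zeros,
         // so carryClear below means at least one lane compared equal.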
3554     movl(tmp, cnt1);
3555     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3556     andl(cnt1,0x0000000F);  //tail count (in chars)
3557 
3558     bind(SCAN_TO_16_CHAR_LOOP);
3559     vmovdqu(vec3, Address(result, 0));
3560     vpcmpeqw(vec3, vec3, vec1, 1);
3561     vptest(vec2, vec3);
3562     jcc(Assembler::carryClear, FOUND_CHAR);
3563     addptr(result, 32);
3564     subl(tmp, 2*stride);
3565     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3566     jmp(SCAN_TO_8_CHAR);
3567     bind(SCAN_TO_8_CHAR_INIT);
3568     movdl(vec1, ch);
3569     pshuflw(vec1, vec1, 0x00);
3570     pshufd(vec1, vec1, 0);
3571     pxor(vec2, vec2);
3572   }
3573   bind(SCAN_TO_8_CHAR);
3574   cmpl(cnt1, stride);
3575   jcc(Assembler::less, SCAN_TO_CHAR);
3576   if (UseAVX < 2) {
3577     movdl(vec1, ch);
3578     pshuflw(vec1, vec1, 0x00);
3579     pshufd(vec1, vec1, 0);
3580     pxor(vec2, vec2);
3581   }
3582   movl(tmp, cnt1);
3583   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3584   andl(cnt1,0x00000007);  //tail count (in chars)
3585 
3586   bind(SCAN_TO_8_CHAR_LOOP);
3587   movdqu(vec3, Address(result, 0));
3588   pcmpeqw(vec3, vec1);
3589   ptest(vec2, vec3);
3590   jcc(Assembler::carryClear, FOUND_CHAR);
3591   addptr(result, 16);
3592   subl(tmp, stride);
3593   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3594   bind(SCAN_TO_CHAR);
3595   testl(cnt1, cnt1);
3596   jcc(Assembler::zero, RET_NOT_FOUND);
3597   bind(SCAN_TO_CHAR_LOOP);
3598   load_unsigned_short(tmp, Address(result, 0));
3599   cmpl(ch, tmp);
3600   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3601   addptr(result, 2);
3602   subl(cnt1, 1);
3603   jccb(Assembler::zero, RET_NOT_FOUND);
3604   jmp(SCAN_TO_CHAR_LOOP);
3605 
3606   bind(RET_NOT_FOUND);
3607   movl(result, -1);
3608   jmpb(DONE_LABEL);
3609 
3610   bind(FOUND_CHAR);
3611   if (UseAVX >= 2) {
3612     vpmovmskb(tmp, vec3);
3613   } else {
3614     pmovmskb(tmp, vec3);
3615   }
3616   bsfl(ch, tmp);
3617   addptr(result, ch);
3618 
3619   bind(FOUND_SEQ_CHAR);
3620   subptr(result, str1);
3621   shrl(result, 1);
3622 
3623   bind(DONE_LABEL);
3624 } // string_indexof_char
3625 
3626 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3627                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3628   ShortBranchVerifier sbv(this);
3629   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3630 
3631   int stride = 16;
3632 
3633   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3634         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3635         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3636         FOUND_SEQ_CHAR, DONE_LABEL;
3637 
3638   movptr(result, str1);
3639   if (UseAVX >= 2) {
3640     cmpl(cnt1, stride);
3641     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3642     cmpl(cnt1, stride*2);
3643     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3644     movdl(vec1, ch);
3645     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3646     vpxor(vec2, vec2);
3647     movl(tmp, cnt1);
3648     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3649     andl(cnt1,0x0000001F);  //tail count (in chars)
3650 
3651     bind(SCAN_TO_32_CHAR_LOOP);
3652     vmovdqu(vec3, Address(result, 0));
3653     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3654     vptest(vec2, vec3);
3655     jcc(Assembler::carryClear, FOUND_CHAR);
3656     addptr(result, 32);
3657     subl(tmp, stride*2);
3658     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3659     jmp(SCAN_TO_16_CHAR);
3660 
3661     bind(SCAN_TO_16_CHAR_INIT);
3662     movdl(vec1, ch);
3663     pxor(vec2, vec2);
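         // pshufb with an all-zero shuffle mask replicates byte 0 of vec1 into every
         // byte lane, broadcasting ch across the 16-byte vector.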
3664     pshufb(vec1, vec2);
3665   }
3666 
3667   bind(SCAN_TO_16_CHAR);
3668   cmpl(cnt1, stride);
3669   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3670   if (UseAVX < 2) {
3671     movdl(vec1, ch);
3672     pxor(vec2, vec2);
3673     pshufb(vec1, vec2);
3674   }
3675   movl(tmp, cnt1);
3676   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3677   andl(cnt1,0x0000000F);  //tail count (in bytes)
3678 
3679   bind(SCAN_TO_16_CHAR_LOOP);
3680   movdqu(vec3, Address(result, 0));
3681   pcmpeqb(vec3, vec1);
3682   ptest(vec2, vec3);
3683   jcc(Assembler::carryClear, FOUND_CHAR);
3684   addptr(result, 16);
3685   subl(tmp, stride);
3686   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3687 
3688   bind(SCAN_TO_CHAR_INIT);
3689   testl(cnt1, cnt1);
3690   jcc(Assembler::zero, RET_NOT_FOUND);
3691   bind(SCAN_TO_CHAR_LOOP);
3692   load_unsigned_byte(tmp, Address(result, 0));
3693   cmpl(ch, tmp);
3694   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3695   addptr(result, 1);
3696   subl(cnt1, 1);
3697   jccb(Assembler::zero, RET_NOT_FOUND);
3698   jmp(SCAN_TO_CHAR_LOOP);
3699 
3700   bind(RET_NOT_FOUND);
3701   movl(result, -1);
3702   jmpb(DONE_LABEL);
3703 
3704   bind(FOUND_CHAR);
3705   if (UseAVX >= 2) {
3706     vpmovmskb(tmp, vec3);
3707   } else {
3708     pmovmskb(tmp, vec3);
3709   }
3710   bsfl(ch, tmp);
3711   addptr(result, ch);
3712 
3713   bind(FOUND_SEQ_CHAR);
3714   subptr(result, str1);
3715 
3716   bind(DONE_LABEL);
3717 } // stringL_indexof_char
3718 
3719 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3720   switch (eltype) {
3721   case T_BOOLEAN: return sizeof(jboolean);
3722   case T_BYTE:  return sizeof(jbyte);
3723   case T_SHORT: return sizeof(jshort);
3724   case T_CHAR:  return sizeof(jchar);
3725   case T_INT:   return sizeof(jint);
3726   default:
3727     ShouldNotReachHere();
3728     return -1;
3729   }
3730 }
3731 
3732 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3733   switch (eltype) {
3734   // T_BOOLEAN used as surrogate for unsigned byte
3735   case T_BOOLEAN: movzbl(dst, src);   break;
3736   case T_BYTE:    movsbl(dst, src);   break;
3737   case T_SHORT:   movswl(dst, src);   break;
3738   case T_CHAR:    movzwl(dst, src);   break;
3739   case T_INT:     movl(dst, src);     break;
3740   default:
3741     ShouldNotReachHere();
3742   }
3743 }
3744 
3745 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3746   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3747 }
3748 
3749 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3750   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3751 }
3752 
3753 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3754   const int vlen = Assembler::AVX_256bit;
3755   switch (eltype) {
3756   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3757   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3758   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3759   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3760   case T_INT:
3761     // do nothing
3762     break;
3763   default:
3764     ShouldNotReachHere();
3765   }
3766 }
3767 
3768 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3769                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3770                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3771                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3772                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3773                                         BasicType eltype) {
3774   ShortBranchVerifier sbv(this);
3775   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3776   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3777   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3778 
3779   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3780         SHORT_UNROLLED_LOOP_EXIT,
3781         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3782         UNROLLED_VECTOR_LOOP_BEGIN,
3783         END;
3784   switch (eltype) {
3785   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3786   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3787   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3788   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3789   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3790   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3791   }
3792 
3793   // For "renaming" for readibility of the code
3794   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3795                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3796                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3797 
3798   const int elsize = arrays_hashcode_elsize(eltype);
3799 
3800   /*
3801     if (cnt1 >= 2) {
3802       if (cnt1 >= 32) {
3803         UNROLLED VECTOR LOOP
3804       }
3805       UNROLLED SCALAR LOOP
3806     }
3807     SINGLE SCALAR
3808    */
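       // The vector loop processes 32 elements per iteration with four 256-bit
       // accumulators: each iteration scales the accumulators (and the scalar result)
       // by the first table entry (31^32 for a 32-element block) and adds the next
       // 8 elements to each accumulator; the per-lane powers of 31 are applied only
       // once after the loop, when the lanes are reduced into result.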
3809 
3810   cmpl(cnt1, 32);
3811   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3812 
3813   // cnt1 >= 32 && generate_vectorized_loop
3814   xorl(index, index);
3815 
3816   // vresult = IntVector.zero(I256);
3817   for (int idx = 0; idx < 4; idx++) {
3818     vpxor(vresult[idx], vresult[idx]);
3819   }
3820   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3821   Register bound = tmp2;
3822   Register next = tmp3;
3823   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3824   movl(next, Address(tmp2, 0));
3825   movdl(vnext, next);
3826   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3827 
3828   // index = 0;
3829   // bound = cnt1 & ~(32 - 1);
3830   movl(bound, cnt1);
3831   andl(bound, ~(32 - 1));
3832   // for (; index < bound; index += 32) {
3833   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3834   // result *= next;
3835   imull(result, next);
3836   // Loop fission to front-load the cost of fetching from memory; OOO execution
3837   // can then hopefully do a better job of prefetching.
3838   for (int idx = 0; idx < 4; idx++) {
3839     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3840   }
3841   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3842   for (int idx = 0; idx < 4; idx++) {
3843     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3844     arrays_hashcode_elvcast(vtmp[idx], eltype);
3845     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3846   }
3847   // index += 32;
3848   addl(index, 32);
3849   // index < bound;
3850   cmpl(index, bound);
3851   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3852   // }
3853 
3854   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3855   subl(cnt1, bound);
3856   // release bound
3857 
3858   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3859   for (int idx = 0; idx < 4; idx++) {
3860     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3861     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3862     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3863   }
3864   // result += vresult.reduceLanes(ADD);
3865   for (int idx = 0; idx < 4; idx++) {
3866     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3867   }
3868 
3869   // } else if (cnt1 < 32) {
3870 
3871   bind(SHORT_UNROLLED_BEGIN);
3872   // int i = 1;
3873   movl(index, 1);
3874   cmpl(index, cnt1);
3875   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3876 
3877   // for (; i < cnt1 ; i += 2) {
3878   bind(SHORT_UNROLLED_LOOP_BEGIN);
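       // result = result * 31 * 31 + ary1[index - 1] * 31 + ary1[index]
       // (961 == 31 * 31, and x * 31 is computed as (x << 5) - x)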
3879   movl(tmp3, 961);
3880   imull(result, tmp3);
3881   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3882   movl(tmp3, tmp2);
3883   shll(tmp3, 5);
3884   subl(tmp3, tmp2);
3885   addl(result, tmp3);
3886   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3887   addl(result, tmp3);
3888   addl(index, 2);
3889   cmpl(index, cnt1);
3890   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3891 
3892   // }
3893   // if (i >= cnt1) {
3894   bind(SHORT_UNROLLED_LOOP_EXIT);
3895   jccb(Assembler::greater, END);
3896   movl(tmp2, result);
3897   shll(result, 5);
3898   subl(result, tmp2);
3899   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3900   addl(result, tmp3);
3901   // }
3902   bind(END);
3903 
3904   BLOCK_COMMENT("} // arrays_hashcode");
3905 
3906 } // arrays_hashcode
3907 
3908 // helper function for string_compare
3909 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3910                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3911                                            Address::ScaleFactor scale2, Register index, int ae) {
3912   if (ae == StrIntrinsicNode::LL) {
3913     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3914     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3915   } else if (ae == StrIntrinsicNode::UU) {
3916     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3917     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3918   } else {
3919     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3920     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3921   }
3922 }
3923 
3924 // Compare strings, used for char[] and byte[].
3925 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3926                                        Register cnt1, Register cnt2, Register result,
3927                                        XMMRegister vec1, int ae, KRegister mask) {
3928   ShortBranchVerifier sbv(this);
3929   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3930   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3931   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3932   int stride2x2 = 0x40;
3933   Address::ScaleFactor scale = Address::no_scale;
3934   Address::ScaleFactor scale1 = Address::no_scale;
3935   Address::ScaleFactor scale2 = Address::no_scale;
3936 
3937   if (ae != StrIntrinsicNode::LL) {
3938     stride2x2 = 0x20;
3939   }
3940 
3941   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3942     shrl(cnt2, 1);
3943   }
3944   // Compute the minimum of the string lengths and the
3945   // difference of the string lengths (stack).
3946   // Do the conditional move stuff
3947   movl(result, cnt1);
3948   subl(cnt1, cnt2);
3949   push(cnt1);
3950   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3951 
3952   // Is the minimum length zero?
3953   testl(cnt2, cnt2);
3954   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3955   if (ae == StrIntrinsicNode::LL) {
3956     // Load first bytes
3957     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3958     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3959   } else if (ae == StrIntrinsicNode::UU) {
3960     // Load first characters
3961     load_unsigned_short(result, Address(str1, 0));
3962     load_unsigned_short(cnt1, Address(str2, 0));
3963   } else {
3964     load_unsigned_byte(result, Address(str1, 0));
3965     load_unsigned_short(cnt1, Address(str2, 0));
3966   }
3967   subl(result, cnt1);
3968   jcc(Assembler::notZero,  POP_LABEL);
3969 
3970   if (ae == StrIntrinsicNode::UU) {
3971     // Divide length by 2 to get number of chars
3972     shrl(cnt2, 1);
3973   }
3974   cmpl(cnt2, 1);
3975   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3976 
3977   // Check if the strings start at the same location and setup scale and stride
3978   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3979     cmpptr(str1, str2);
3980     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3981     if (ae == StrIntrinsicNode::LL) {
3982       scale = Address::times_1;
3983       stride = 16;
3984     } else {
3985       scale = Address::times_2;
3986       stride = 8;
3987     }
3988   } else {
3989     scale1 = Address::times_1;
3990     scale2 = Address::times_2;
3991     // scale not used
3992     stride = 8;
3993   }
3994 
3995   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3996     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3997     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3998     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3999     Label COMPARE_TAIL_LONG;
4000     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
4001 
4002     int pcmpmask = 0x19;
4003     if (ae == StrIntrinsicNode::LL) {
4004       pcmpmask &= ~0x01;
4005     }
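         // imm 0x19: unsigned words, equal-each aggregation, negated polarity, so rcx
         // receives the index of the first mismatching element and CF is set when a
         // mismatch exists; for LL, bit 0 was cleared above to use unsigned bytes.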
4006 
4007     // Set up to compare 16-char (32-byte) vectors,
4008     // starting from the first character again because it has an aligned address.
4009     if (ae == StrIntrinsicNode::LL) {
4010       stride2 = 32;
4011     } else {
4012       stride2 = 16;
4013     }
4014     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4015       adr_stride = stride << scale;
4016     } else {
4017       adr_stride1 = 8;  //stride << scale1;
4018       adr_stride2 = 16; //stride << scale2;
4019     }
4020 
4021     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4022     // rax and rdx are used by pcmpestri as elements counters
4023     movl(result, cnt2);
4024     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
4025     jcc(Assembler::zero, COMPARE_TAIL_LONG);
4026 
4027     // fast path : compare first 2 8-char vectors.
4028     bind(COMPARE_16_CHARS);
4029     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4030       movdqu(vec1, Address(str1, 0));
4031     } else {
4032       pmovzxbw(vec1, Address(str1, 0));
4033     }
4034     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4035     jccb(Assembler::below, COMPARE_INDEX_CHAR);
4036 
4037     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4038       movdqu(vec1, Address(str1, adr_stride));
4039       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
4040     } else {
4041       pmovzxbw(vec1, Address(str1, adr_stride1));
4042       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
4043     }
4044     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
4045     addl(cnt1, stride);
4046 
4047     // Compare the characters at index in cnt1
4048     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
4049     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4050     subl(result, cnt2);
4051     jmp(POP_LABEL);
4052 
4053     // Setup the registers to start vector comparison loop
4054     bind(COMPARE_WIDE_VECTORS);
4055     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4056       lea(str1, Address(str1, result, scale));
4057       lea(str2, Address(str2, result, scale));
4058     } else {
4059       lea(str1, Address(str1, result, scale1));
4060       lea(str2, Address(str2, result, scale2));
4061     }
4062     subl(result, stride2);
4063     subl(cnt2, stride2);
4064     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
4065     negptr(result);
4066 
4067     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
4068     bind(COMPARE_WIDE_VECTORS_LOOP);
4069 
4070 #ifdef _LP64
4071     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4072       cmpl(cnt2, stride2x2);
4073       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4074       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
4075       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
4076 
4077       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4078       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4079         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
4080         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise mask has some 0 bits
4081       } else {
4082         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
4083         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise mask has some 0 bits
4084       }
4085       kortestql(mask, mask);
4086       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
4087       addptr(result, stride2x2);  // update since we already compared at this addr
4088       subl(cnt2, stride2x2);      // and sub the size too
4089       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4090 
4091       vpxor(vec1, vec1);
4092       jmpb(COMPARE_WIDE_TAIL);
4093     }//if (VM_Version::supports_avx512vlbw())
4094 #endif // _LP64
4095 
4096 
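    // AVX2 loop: XOR 32 bytes from each string per iteration (for the mixed encodings one
    // operand is zero-extended from bytes to chars first); vptest finds any non-zero bit,
    // i.e. a mismatch.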
4097     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4098     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4099       vmovdqu(vec1, Address(str1, result, scale));
4100       vpxor(vec1, Address(str2, result, scale));
4101     } else {
4102       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
4103       vpxor(vec1, Address(str2, result, scale2));
4104     }
4105     vptest(vec1, vec1);
4106     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
4107     addptr(result, stride2);
4108     subl(cnt2, stride2);
4109     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
4110     // clean upper bits of YMM registers
4111     vpxor(vec1, vec1);
4112 
4113     // compare wide vectors tail
4114     bind(COMPARE_WIDE_TAIL);
4115     testptr(result, result);
4116     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4117 
4118     movl(result, stride2);
4119     movl(cnt2, result);
4120     negptr(result);
4121     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4122 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
4124     bind(VECTOR_NOT_EQUAL);
4125     // clean upper bits of YMM registers
4126     vpxor(vec1, vec1);
4127     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4128       lea(str1, Address(str1, result, scale));
4129       lea(str2, Address(str2, result, scale));
4130     } else {
4131       lea(str1, Address(str1, result, scale1));
4132       lea(str2, Address(str2, result, scale2));
4133     }
4134     jmp(COMPARE_16_CHARS);
4135 
    // Compare tail chars, length between 1 and 15 chars
4137     bind(COMPARE_TAIL_LONG);
4138     movl(cnt2, result);
4139     cmpl(cnt2, stride);
4140     jcc(Assembler::less, COMPARE_SMALL_STR);
4141 
4142     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4143       movdqu(vec1, Address(str1, 0));
4144     } else {
4145       pmovzxbw(vec1, Address(str1, 0));
4146     }
4147     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4148     jcc(Assembler::below, COMPARE_INDEX_CHAR);
4149     subptr(cnt2, stride);
4150     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4151     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4152       lea(str1, Address(str1, result, scale));
4153       lea(str2, Address(str2, result, scale));
4154     } else {
4155       lea(str1, Address(str1, result, scale1));
4156       lea(str2, Address(str2, result, scale2));
4157     }
4158     negptr(cnt2);
4159     jmpb(WHILE_HEAD_LABEL);
4160 
4161     bind(COMPARE_SMALL_STR);
4162   } else if (UseSSE42Intrinsics) {
4163     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
4164     int pcmpmask = 0x19;
4165     // Setup to compare 8-char (16-byte) vectors,
4166     // start from first character again because it has aligned address.
4167     movl(result, cnt2);
4168     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
4169     if (ae == StrIntrinsicNode::LL) {
4170       pcmpmask &= ~0x01;
4171     }
4172     jcc(Assembler::zero, COMPARE_TAIL);
4173     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4174       lea(str1, Address(str1, result, scale));
4175       lea(str2, Address(str2, result, scale));
4176     } else {
4177       lea(str1, Address(str1, result, scale1));
4178       lea(str2, Address(str2, result, scale2));
4179     }
4180     negptr(result);
4181 
4182     // pcmpestri
4183     //   inputs:
    //     vec1 - substring
4185     //     rax - negative string length (elements count)
4186     //     mem - scanned string
4187     //     rdx - string length (elements count)
4188     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4189     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4190     //   outputs:
4191     //     rcx - first mismatched element index
4192     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4193 
4194     bind(COMPARE_WIDE_VECTORS);
4195     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4196       movdqu(vec1, Address(str1, result, scale));
4197       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4198     } else {
4199       pmovzxbw(vec1, Address(str1, result, scale1));
4200       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4201     }
4202     // After pcmpestri cnt1(rcx) contains mismatched element index
4203 
4204     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4205     addptr(result, stride);
4206     subptr(cnt2, stride);
4207     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4208 
4209     // compare wide vectors tail
4210     testptr(result, result);
4211     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4212 
4213     movl(cnt2, stride);
4214     movl(result, stride);
4215     negptr(result);
4216     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4217       movdqu(vec1, Address(str1, result, scale));
4218       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4219     } else {
4220       pmovzxbw(vec1, Address(str1, result, scale1));
4221       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4222     }
4223     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4224 
4225     // Mismatched characters in the vectors
4226     bind(VECTOR_NOT_EQUAL);
4227     addptr(cnt1, result);
4228     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4229     subl(result, cnt2);
4230     jmpb(POP_LABEL);
4231 
4232     bind(COMPARE_TAIL); // limit is zero
4233     movl(cnt2, result);
4234     // Fallthru to tail compare
4235   }
4236   // Shift str2 and str1 to the end of the arrays, negate min
4237   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4238     lea(str1, Address(str1, cnt2, scale));
4239     lea(str2, Address(str2, cnt2, scale));
4240   } else {
4241     lea(str1, Address(str1, cnt2, scale1));
4242     lea(str2, Address(str2, cnt2, scale2));
4243   }
4244   decrementl(cnt2);  // first character was compared already
4245   negptr(cnt2);
4246 
4247   // Compare the rest of the elements
4248   bind(WHILE_HEAD_LABEL);
4249   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4250   subl(result, cnt1);
4251   jccb(Assembler::notZero, POP_LABEL);
4252   increment(cnt2);
4253   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4254 
4255   // Strings are equal up to min length.  Return the length difference.
4256   bind(LENGTH_DIFF_LABEL);
4257   pop(result);
4258   if (ae == StrIntrinsicNode::UU) {
4259     // Divide diff by 2 to get number of chars
4260     sarl(result, 1);
4261   }
4262   jmpb(DONE_LABEL);
4263 
4264 #ifdef _LP64
4265   if (VM_Version::supports_avx512vlbw()) {
4266 
4267     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4268 
4269     kmovql(cnt1, mask);
4270     notq(cnt1);
4271     bsfq(cnt2, cnt1);
4272     if (ae != StrIntrinsicNode::LL) {
4273       // Divide diff by 2 to get number of chars
4274       sarl(cnt2, 1);
4275     }
4276     addq(result, cnt2);
4277     if (ae == StrIntrinsicNode::LL) {
4278       load_unsigned_byte(cnt1, Address(str2, result));
4279       load_unsigned_byte(result, Address(str1, result));
4280     } else if (ae == StrIntrinsicNode::UU) {
4281       load_unsigned_short(cnt1, Address(str2, result, scale));
4282       load_unsigned_short(result, Address(str1, result, scale));
4283     } else {
4284       load_unsigned_short(cnt1, Address(str2, result, scale2));
4285       load_unsigned_byte(result, Address(str1, result, scale1));
4286     }
4287     subl(result, cnt1);
4288     jmpb(POP_LABEL);
4289   }//if (VM_Version::supports_avx512vlbw())
4290 #endif // _LP64
4291 
4292   // Discard the stored length difference
4293   bind(POP_LABEL);
4294   pop(cnt1);
4295 
4296   // That's it
4297   bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
4299     negl(result);
4300   }
4301 
4302 }
4303 
// Search for a non-ASCII character (negative byte value) in a byte array;
// return the index of the first such character, otherwise the length
4306 // of the array segment searched.
4307 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4308 //   @IntrinsicCandidate
4309 //   public static int countPositives(byte[] ba, int off, int len) {
4310 //     for (int i = off; i < off + len; i++) {
4311 //       if (ba[i] < 0) {
4312 //         return i - off;
4313 //       }
4314 //     }
4315 //     return len;
4316 //   }
4317 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4318   Register result, Register tmp1,
4319   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4320   // rsi: byte array
4321   // rcx: len
4322   // rax: result
4323   ShortBranchVerifier sbv(this);
4324   assert_different_registers(ary1, len, result, tmp1);
4325   assert_different_registers(vec1, vec2);
4326   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4327 
4328   movl(result, len); // copy
4329   // len == 0
4330   testl(len, len);
4331   jcc(Assembler::zero, DONE);
4332 
4333   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4334     VM_Version::supports_avx512vlbw() &&
4335     VM_Version::supports_bmi2()) {
4336 
4337     Label test_64_loop, test_tail, BREAK_LOOP;
4338     movl(tmp1, len);
4339     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4340 
4341     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4342     andl(len,  0xffffffc0); // vector count (in chars)
4343     jccb(Assembler::zero, test_tail);
4344 
4345     lea(ary1, Address(ary1, len, Address::times_1));
4346     negptr(len);
4347 
4348     bind(test_64_loop);
4349     // Check whether our 64 elements of size byte contain negatives
4350     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4351     kortestql(mask1, mask1);
4352     jcc(Assembler::notZero, BREAK_LOOP);
4353 
4354     addptr(len, 64);
4355     jccb(Assembler::notZero, test_64_loop);
4356 
4357     bind(test_tail);
4358     // bail out when there is nothing to be done
4359     testl(tmp1, -1);
4360     jcc(Assembler::zero, DONE);
4361 
4362 
    // check the tail for the absence of negatives
4364     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4365 #ifdef _LP64
4366     {
4367       Register tmp3_aliased = len;
4368       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4369       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4370       notq(tmp3_aliased);
4371       kmovql(mask2, tmp3_aliased);
4372     }
4373 #else
4374     Label k_init;
4375     jmp(k_init);
4376 
    // We cannot load 64 bits from a general purpose register here, thus we place the
    // data required to compose 64 ones into the instruction stream.
    // We emit a 64-byte-wide series of elements from 0..63 which is later used as a
    // compare target against the tail count contained in the tmp1 register.
    // The result is a k register holding tmp1 consecutive 1s counting from the least
    // significant bit.
4383     address tmp = pc();
4384     emit_int64(0x0706050403020100);
4385     emit_int64(0x0F0E0D0C0B0A0908);
4386     emit_int64(0x1716151413121110);
4387     emit_int64(0x1F1E1D1C1B1A1918);
4388     emit_int64(0x2726252423222120);
4389     emit_int64(0x2F2E2D2C2B2A2928);
4390     emit_int64(0x3736353433323130);
4391     emit_int64(0x3F3E3D3C3B3A3938);
4392 
4393     bind(k_init);
4394     lea(len, InternalAddress(tmp));
4395     // create mask to test for negative byte inside a vector
4396     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4397     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4398 
4399 #endif
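    // Compare only the tail bytes selected by mask2: lanes holding a negative byte set bits
    // in mask1, and ktestq(mask1, mask2) leaves ZF set when no selected byte is negative.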
4400     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4401     ktestq(mask1, mask2);
4402     jcc(Assembler::zero, DONE);
4403 
4404     // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4406                      // ary1 already pointing to the right place
4407     jmpb(TAIL_START);
4408 
4409     bind(BREAK_LOOP);
4410     // At least one byte in the last 64 byte block was negative.
4411     // Set up to look at the last 64 bytes as if they were a tail
4412     lea(ary1, Address(ary1, len, Address::times_1));
4413     addptr(result, len);
4414     // Ignore the very last byte: if all others are positive,
4415     // it must be negative, so we can skip right to the 2+1 byte
4416     // end comparison at this point
4417     orl(result, 63);
4418     movl(len, 63);
4419     // Fallthru to tail compare
4420   } else {
4421 
4422     if (UseAVX >= 2 && UseSSE >= 2) {
4423       // With AVX2, use 32-byte vector compare
4424       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4425 
4426       // Compare 32-byte vectors
4427       testl(len, 0xffffffe0);   // vector count (in bytes)
4428       jccb(Assembler::zero, TAIL_START);
4429 
4430       andl(len, 0xffffffe0);
4431       lea(ary1, Address(ary1, len, Address::times_1));
4432       negptr(len);
4433 
      movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in the vector
4435       movdl(vec2, tmp1);
4436       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4437 
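      // vptest sets ZF when (data & 0x80808080... mask) is all zero, i.e. no byte in this
      // 32-byte chunk has its sign bit set.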
4438       bind(COMPARE_WIDE_VECTORS);
4439       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4440       vptest(vec1, vec2);
4441       jccb(Assembler::notZero, BREAK_LOOP);
4442       addptr(len, 32);
4443       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4444 
4445       testl(result, 0x0000001f);   // any bytes remaining?
4446       jcc(Assembler::zero, DONE);
4447 
4448       // Quick test using the already prepared vector mask
4449       movl(len, result);
4450       andl(len, 0x0000001f);
4451       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4452       vptest(vec1, vec2);
4453       jcc(Assembler::zero, DONE);
4454       // There are zeros, jump to the tail to determine exactly where
4455       jmpb(TAIL_START);
4456 
4457       bind(BREAK_LOOP);
4458       // At least one byte in the last 32-byte vector is negative.
4459       // Set up to look at the last 32 bytes as if they were a tail
4460       lea(ary1, Address(ary1, len, Address::times_1));
4461       addptr(result, len);
4462       // Ignore the very last byte: if all others are positive,
4463       // it must be negative, so we can skip right to the 2+1 byte
4464       // end comparison at this point
4465       orl(result, 31);
4466       movl(len, 31);
4467       // Fallthru to tail compare
4468     } else if (UseSSE42Intrinsics) {
4469       // With SSE4.2, use double quad vector compare
4470       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4471 
4472       // Compare 16-byte vectors
4473       testl(len, 0xfffffff0);   // vector count (in bytes)
4474       jcc(Assembler::zero, TAIL_START);
4475 
4476       andl(len, 0xfffffff0);
4477       lea(ary1, Address(ary1, len, Address::times_1));
4478       negptr(len);
4479 
4480       movl(tmp1, 0x80808080);
4481       movdl(vec2, tmp1);
4482       pshufd(vec2, vec2, 0);
4483 
4484       bind(COMPARE_WIDE_VECTORS);
4485       movdqu(vec1, Address(ary1, len, Address::times_1));
4486       ptest(vec1, vec2);
4487       jccb(Assembler::notZero, BREAK_LOOP);
4488       addptr(len, 16);
4489       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4490 
4491       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4492       jcc(Assembler::zero, DONE);
4493 
4494       // Quick test using the already prepared vector mask
4495       movl(len, result);
4496       andl(len, 0x0000000f);   // tail count (in bytes)
4497       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4498       ptest(vec1, vec2);
4499       jcc(Assembler::zero, DONE);
4500       jmpb(TAIL_START);
4501 
4502       bind(BREAK_LOOP);
4503       // At least one byte in the last 16-byte vector is negative.
4504       // Set up and look at the last 16 bytes as if they were a tail
4505       lea(ary1, Address(ary1, len, Address::times_1));
4506       addptr(result, len);
4507       // Ignore the very last byte: if all others are positive,
4508       // it must be negative, so we can skip right to the 2+1 byte
4509       // end comparison at this point
4510       orl(result, 15);
4511       movl(len, 15);
4512       // Fallthru to tail compare
4513     }
4514   }
4515 
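  // Scalar tail: check 4 bytes at a time against 0x80808080, then an optional trailing
  // 2-byte char and a final byte.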
4516   bind(TAIL_START);
4517   // Compare 4-byte vectors
4518   andl(len, 0xfffffffc); // vector count (in bytes)
4519   jccb(Assembler::zero, COMPARE_CHAR);
4520 
4521   lea(ary1, Address(ary1, len, Address::times_1));
4522   negptr(len);
4523 
4524   bind(COMPARE_VECTORS);
4525   movl(tmp1, Address(ary1, len, Address::times_1));
4526   andl(tmp1, 0x80808080);
4527   jccb(Assembler::notZero, TAIL_ADJUST);
4528   addptr(len, 4);
4529   jccb(Assembler::notZero, COMPARE_VECTORS);
4530 
4531   // Compare trailing char (final 2-3 bytes), if any
4532   bind(COMPARE_CHAR);
4533 
4534   testl(result, 0x2);   // tail  char
4535   jccb(Assembler::zero, COMPARE_BYTE);
4536   load_unsigned_short(tmp1, Address(ary1, 0));
4537   andl(tmp1, 0x00008080);
4538   jccb(Assembler::notZero, CHAR_ADJUST);
4539   lea(ary1, Address(ary1, 2));
4540 
4541   bind(COMPARE_BYTE);
4542   testl(result, 0x1);   // tail  byte
4543   jccb(Assembler::zero, DONE);
4544   load_unsigned_byte(tmp1, Address(ary1, 0));
4545   testl(tmp1, 0x00000080);
4546   jccb(Assembler::zero, DONE);
4547   subptr(result, 1);
4548   jmpb(DONE);
4549 
4550   bind(TAIL_ADJUST);
4551   // there are negative bits in the last 4 byte block.
4552   // Adjust result and check the next three bytes
4553   addptr(result, len);
4554   orl(result, 3);
4555   lea(ary1, Address(ary1, len, Address::times_1));
4556   jmpb(COMPARE_CHAR);
4557 
4558   bind(CHAR_ADJUST);
4559   // We are looking at a char + optional byte tail, and found that one
4560   // of the bytes in the char is negative. Adjust the result, check the
4561   // first byte and readjust if needed.
4562   andl(result, 0xfffffffc);
4563   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4564   jccb(Assembler::notZero, DONE);
4565   addptr(result, 1);
4566 
4567   // That's it
4568   bind(DONE);
4569   if (UseAVX >= 2 && UseSSE >= 2) {
4570     // clean upper bits of YMM registers
4571     vpxor(vec1, vec1);
4572     vpxor(vec2, vec2);
4573   }
4574 }
4575 
4576 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4577 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4578                                       Register limit, Register result, Register chr,
4579                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4580   ShortBranchVerifier sbv(this);
4581   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4582 
4583   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4584   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4585 
4586   if (is_array_equ) {
4587     // Check the input args
4588     cmpoop(ary1, ary2);
4589     jcc(Assembler::equal, TRUE_LABEL);
4590 
4591     // Need additional checks for arrays_equals.
4592     testptr(ary1, ary1);
4593     jcc(Assembler::zero, FALSE_LABEL);
4594     testptr(ary2, ary2);
4595     jcc(Assembler::zero, FALSE_LABEL);
4596 
4597     // Check the lengths
4598     movl(limit, Address(ary1, length_offset));
4599     cmpl(limit, Address(ary2, length_offset));
4600     jcc(Assembler::notEqual, FALSE_LABEL);
4601   }
4602 
4603   // count == 0
4604   testl(limit, limit);
4605   jcc(Assembler::zero, TRUE_LABEL);
4606 
4607   if (is_array_equ) {
4608     // Load array address
4609     lea(ary1, Address(ary1, base_offset));
4610     lea(ary2, Address(ary2, base_offset));
4611   }
4612 
4613   if (is_array_equ && is_char) {
4614     // arrays_equals when used for char[].
4615     shll(limit, 1);      // byte count != 0
4616   }
4617   movl(result, limit); // copy
4618 
4619   if (UseAVX >= 2) {
4620     // With AVX2, use 32-byte vector compare
4621     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4622 
4623     // Compare 32-byte vectors
4624     andl(result, 0x0000001f);  //   tail count (in bytes)
4625     andl(limit, 0xffffffe0);   // vector count (in bytes)
4626     jcc(Assembler::zero, COMPARE_TAIL);
4627 
4628     lea(ary1, Address(ary1, limit, Address::times_1));
4629     lea(ary2, Address(ary2, limit, Address::times_1));
4630     negptr(limit);
4631 
4632 #ifdef _LP64
4633     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4634       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4635 
4636       cmpl(limit, -64);
4637       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4638 
4639       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4640 
4641       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4642       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4643       kortestql(mask, mask);
4644       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4645       addptr(limit, 64);  // update since we already compared at this addr
4646       cmpl(limit, -64);
4647       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4648 
4649       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4651       //  cmpl(limit, 0);
4652       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4653       // But since we stopped at the points ary{1,2}+limit which are
4654       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4655       // (|limit| <= 32 and result < 32),
4656       // we may just compare the last 64 bytes.
4657       //
      addptr(result, -64);   // it is safe because we just came from this area
4659       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4660       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4661       kortestql(mask, mask);
4662       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4663 
4664       jmp(TRUE_LABEL);
4665 
4666       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4667 
4668     }//if (VM_Version::supports_avx512vlbw())
4669 #endif //_LP64
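    // Generic 32-byte AVX2 loop: XOR corresponding chunks from both arrays; vptest reports
    // any non-zero bit, i.e. a difference.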
4670     bind(COMPARE_WIDE_VECTORS);
4671     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4672     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4673     vpxor(vec1, vec2);
4674 
4675     vptest(vec1, vec1);
4676     jcc(Assembler::notZero, FALSE_LABEL);
4677     addptr(limit, 32);
4678     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4679 
4680     testl(result, result);
4681     jcc(Assembler::zero, TRUE_LABEL);
4682 
4683     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4684     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4685     vpxor(vec1, vec2);
4686 
4687     vptest(vec1, vec1);
4688     jccb(Assembler::notZero, FALSE_LABEL);
4689     jmpb(TRUE_LABEL);
4690 
4691     bind(COMPARE_TAIL); // limit is zero
4692     movl(limit, result);
4693     // Fallthru to tail compare
4694   } else if (UseSSE42Intrinsics) {
4695     // With SSE4.2, use double quad vector compare
4696     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4697 
4698     // Compare 16-byte vectors
4699     andl(result, 0x0000000f);  //   tail count (in bytes)
4700     andl(limit, 0xfffffff0);   // vector count (in bytes)
4701     jcc(Assembler::zero, COMPARE_TAIL);
4702 
4703     lea(ary1, Address(ary1, limit, Address::times_1));
4704     lea(ary2, Address(ary2, limit, Address::times_1));
4705     negptr(limit);
4706 
4707     bind(COMPARE_WIDE_VECTORS);
4708     movdqu(vec1, Address(ary1, limit, Address::times_1));
4709     movdqu(vec2, Address(ary2, limit, Address::times_1));
4710     pxor(vec1, vec2);
4711 
4712     ptest(vec1, vec1);
4713     jcc(Assembler::notZero, FALSE_LABEL);
4714     addptr(limit, 16);
4715     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4716 
4717     testl(result, result);
4718     jcc(Assembler::zero, TRUE_LABEL);
4719 
4720     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4721     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4722     pxor(vec1, vec2);
4723 
4724     ptest(vec1, vec1);
4725     jccb(Assembler::notZero, FALSE_LABEL);
4726     jmpb(TRUE_LABEL);
4727 
4728     bind(COMPARE_TAIL); // limit is zero
4729     movl(limit, result);
4730     // Fallthru to tail compare
4731   }
4732 
4733   // Compare 4-byte vectors
4734   andl(limit, 0xfffffffc); // vector count (in bytes)
4735   jccb(Assembler::zero, COMPARE_CHAR);
4736 
4737   lea(ary1, Address(ary1, limit, Address::times_1));
4738   lea(ary2, Address(ary2, limit, Address::times_1));
4739   negptr(limit);
4740 
4741   bind(COMPARE_VECTORS);
4742   movl(chr, Address(ary1, limit, Address::times_1));
4743   cmpl(chr, Address(ary2, limit, Address::times_1));
4744   jccb(Assembler::notEqual, FALSE_LABEL);
4745   addptr(limit, 4);
4746   jcc(Assembler::notZero, COMPARE_VECTORS);
4747 
4748   // Compare trailing char (final 2 bytes), if any
4749   bind(COMPARE_CHAR);
4750   testl(result, 0x2);   // tail  char
4751   jccb(Assembler::zero, COMPARE_BYTE);
4752   load_unsigned_short(chr, Address(ary1, 0));
4753   load_unsigned_short(limit, Address(ary2, 0));
4754   cmpl(chr, limit);
4755   jccb(Assembler::notEqual, FALSE_LABEL);
4756 
4757   if (is_array_equ && is_char) {
4758     bind(COMPARE_BYTE);
4759   } else {
4760     lea(ary1, Address(ary1, 2));
4761     lea(ary2, Address(ary2, 2));
4762 
4763     bind(COMPARE_BYTE);
4764     testl(result, 0x1);   // tail  byte
4765     jccb(Assembler::zero, TRUE_LABEL);
4766     load_unsigned_byte(chr, Address(ary1, 0));
4767     load_unsigned_byte(limit, Address(ary2, 0));
4768     cmpl(chr, limit);
4769     jccb(Assembler::notEqual, FALSE_LABEL);
4770   }
4771   bind(TRUE_LABEL);
4772   movl(result, 1);   // return true
4773   jmpb(DONE);
4774 
4775   bind(FALSE_LABEL);
4776   xorl(result, result); // return false
4777 
4778   // That's it
4779   bind(DONE);
4780   if (UseAVX >= 2) {
4781     // clean upper bits of YMM registers
4782     vpxor(vec1, vec1);
4783     vpxor(vec2, vec2);
4784   }
4785 }
4786 
4787 #ifdef _LP64
4788 
4789 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4790 #define __ masm.
4791   Register dst = stub.data<0>();
4792   XMMRegister src = stub.data<1>();
4793   address target = stub.data<2>();
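  // Slow path taken when the conversion produced the sentinel value: spill the XMM source to
  // the stack, call the fixup routine, and pop the corrected integer result into dst.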
4794   __ bind(stub.entry());
4795   __ subptr(rsp, 8);
4796   __ movdbl(Address(rsp), src);
4797   __ call(RuntimeAddress(target));
4798   __ pop(dst);
4799   __ jmp(stub.continuation());
4800 #undef __
4801 }
4802 
4803 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4804   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4805   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4806 
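  // cvttss2si/cvttsd2si produce the "integer indefinite" value (0x80000000 or
  // 0x8000000000000000) for NaN and out-of-range inputs, so comparing against that
  // sentinel detects every case that needs the slow-path fixup.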
4807   address slowpath_target;
4808   if (dst_bt == T_INT) {
4809     if (src_bt == T_FLOAT) {
4810       cvttss2sil(dst, src);
4811       cmpl(dst, 0x80000000);
4812       slowpath_target = StubRoutines::x86::f2i_fixup();
4813     } else {
4814       cvttsd2sil(dst, src);
4815       cmpl(dst, 0x80000000);
4816       slowpath_target = StubRoutines::x86::d2i_fixup();
4817     }
4818   } else {
4819     if (src_bt == T_FLOAT) {
4820       cvttss2siq(dst, src);
4821       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4822       slowpath_target = StubRoutines::x86::f2l_fixup();
4823     } else {
4824       cvttsd2siq(dst, src);
4825       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4826       slowpath_target = StubRoutines::x86::d2l_fixup();
4827     }
4828   }
4829 
4830   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4831   jcc(Assembler::equal, stub->entry());
4832   bind(stub->continuation());
4833 }
4834 
4835 #endif // _LP64
4836 
4837 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4838                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4839   switch(ideal_opc) {
4840     case Op_LShiftVS:
4841       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4842     case Op_LShiftVI:
4843       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4844     case Op_LShiftVL:
4845       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4846     case Op_RShiftVS:
4847       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4848     case Op_RShiftVI:
4849       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4850     case Op_RShiftVL:
4851       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4852     case Op_URShiftVS:
4853       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4854     case Op_URShiftVI:
4855       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4856     case Op_URShiftVL:
4857       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4858     case Op_RotateRightV:
4859       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4860     case Op_RotateLeftV:
4861       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4862     default:
4863       fatal("Unsupported masked operation"); break;
4864   }
4865 }
4866 
4867 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4868                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4869                                     bool is_varshift) {
4870   switch (ideal_opc) {
4871     case Op_AddVB:
4872       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4873     case Op_AddVS:
4874       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4875     case Op_AddVI:
4876       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4877     case Op_AddVL:
4878       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4879     case Op_AddVF:
4880       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4881     case Op_AddVD:
4882       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_SubVB:
4884       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4885     case Op_SubVS:
4886       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4887     case Op_SubVI:
4888       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4889     case Op_SubVL:
4890       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4891     case Op_SubVF:
4892       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4893     case Op_SubVD:
4894       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4895     case Op_MulVS:
4896       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4897     case Op_MulVI:
4898       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4899     case Op_MulVL:
4900       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4901     case Op_MulVF:
4902       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4903     case Op_MulVD:
4904       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4905     case Op_DivVF:
4906       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4907     case Op_DivVD:
4908       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4909     case Op_SqrtVF:
4910       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4911     case Op_SqrtVD:
4912       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4913     case Op_AbsVB:
4914       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4915     case Op_AbsVS:
4916       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4917     case Op_AbsVI:
4918       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4919     case Op_AbsVL:
4920       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4921     case Op_FmaVF:
4922       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4923     case Op_FmaVD:
4924       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4925     case Op_VectorRearrange:
4926       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4927     case Op_LShiftVS:
4928       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4929     case Op_LShiftVI:
4930       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4931     case Op_LShiftVL:
4932       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4933     case Op_RShiftVS:
4934       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4935     case Op_RShiftVI:
4936       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4937     case Op_RShiftVL:
4938       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4939     case Op_URShiftVS:
4940       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4941     case Op_URShiftVI:
4942       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4943     case Op_URShiftVL:
4944       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4945     case Op_RotateLeftV:
4946       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4947     case Op_RotateRightV:
4948       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4949     case Op_MaxV:
4950       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4951     case Op_MinV:
4952       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4953     case Op_XorV:
4954       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4955     case Op_OrV:
4956       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4957     case Op_AndV:
4958       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4959     default:
4960       fatal("Unsupported masked operation"); break;
4961   }
4962 }
4963 
4964 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4965                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4966   switch (ideal_opc) {
4967     case Op_AddVB:
4968       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4969     case Op_AddVS:
4970       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4971     case Op_AddVI:
4972       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4973     case Op_AddVL:
4974       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4975     case Op_AddVF:
4976       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4977     case Op_AddVD:
4978       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4979     case Op_SubVB:
4980       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4981     case Op_SubVS:
4982       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4983     case Op_SubVI:
4984       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4985     case Op_SubVL:
4986       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4987     case Op_SubVF:
4988       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4989     case Op_SubVD:
4990       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4991     case Op_MulVS:
4992       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4993     case Op_MulVI:
4994       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4995     case Op_MulVL:
4996       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4997     case Op_MulVF:
4998       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4999     case Op_MulVD:
5000       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
5001     case Op_DivVF:
5002       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
5003     case Op_DivVD:
5004       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
5005     case Op_FmaVF:
5006       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
5007     case Op_FmaVD:
5008       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
5009     case Op_MaxV:
5010       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5011     case Op_MinV:
5012       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5013     case Op_XorV:
5014       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5015     case Op_OrV:
5016       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5017     case Op_AndV:
5018       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
5019     default:
5020       fatal("Unsupported masked operation"); break;
5021   }
5022 }
5023 
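// Choose an element type from the mask length so that kand/kor/kxor below operate on a
// k register of the matching width (8-, 16-, 32- or 64-bit mask).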
5024 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
5025                                   KRegister src1, KRegister src2) {
5026   BasicType etype = T_ILLEGAL;
5027   switch(mask_len) {
5028     case 2:
5029     case 4:
5030     case 8:  etype = T_BYTE; break;
5031     case 16: etype = T_SHORT; break;
5032     case 32: etype = T_INT; break;
5033     case 64: etype = T_LONG; break;
5034     default: fatal("Unsupported type"); break;
5035   }
5036   assert(etype != T_ILLEGAL, "");
5037   switch(ideal_opc) {
5038     case Op_AndVMask:
5039       kand(etype, dst, src1, src2); break;
5040     case Op_OrVMask:
5041       kor(etype, dst, src1, src2); break;
5042     case Op_XorVMask:
5043       kxor(etype, dst, src1, src2); break;
5044     default:
5045       fatal("Unsupported masked operation"); break;
5046   }
5047 }
5048 
5049 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5051  * If src is NaN, the result is 0.
5052  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
5053  * the result is equal to the value of Integer.MIN_VALUE.
5054  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
5055  * the result is equal to the value of Integer.MAX_VALUE.
5056  */
5057 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5058                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5059                                                                    Register rscratch, AddressLiteral float_sign_flip,
5060                                                                    int vec_enc) {
5061   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5062   Label done;
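  // Compare the converted lanes against float_sign_flip (0x80000000): matching lanes may hold
  // a special value; if none match, the fast path is done.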
5063   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
5064   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5065   vptest(xtmp2, xtmp2, vec_enc);
5066   jccb(Assembler::equal, done);
5067 
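  // Build Integer.MAX_VALUE (0x7fffffff) in xtmp1 by XORing the sign-flip pattern with an
  // all-ones vector.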
5068   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
5069   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
5070 
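  // Zero the destination lanes whose source is NaN (unordered with itself).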
5071   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5072   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
5073   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5074 
  // Recompute the mask for the remaining special-value lanes.
5076   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5077   // Extract SRC values corresponding to TRUE mask lanes.
5078   vpand(xtmp4, xtmp2, src, vec_enc);
5079   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
5080   // values are set.
5081   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5082 
5083   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5084   bind(done);
5085 }
5086 
5087 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5088                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5089                                                                     Register rscratch, AddressLiteral float_sign_flip,
5090                                                                     int vec_enc) {
5091   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5092   Label done;
5093   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5094   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5095   kortestwl(ktmp1, ktmp1);
5096   jccb(Assembler::equal, done);
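  // Zero the lanes whose source is NaN, then replace the remaining special-value lanes that
  // came from non-negative sources with Integer.MAX_VALUE (vpternlogd 0x11 forms the bitwise
  // NOT of the sign-flip pattern).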
5097 
5098   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5099   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5100   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5101 
5102   kxorwl(ktmp1, ktmp1, ktmp2);
5103   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5104   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5105   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5106   bind(done);
5107 }
5108 
5109 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5110                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5111                                                                      Register rscratch, AddressLiteral double_sign_flip,
5112                                                                      int vec_enc) {
5113   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5114 
5115   Label done;
5116   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5117   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5118   kortestwl(ktmp1, ktmp1);
5119   jccb(Assembler::equal, done);
5120 
5121   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5122   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5123   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5124 
5125   kxorwl(ktmp1, ktmp1, ktmp2);
5126   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5127   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5128   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5129   bind(done);
5130 }
5131 
5132 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5133                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5134                                                                      Register rscratch, AddressLiteral float_sign_flip,
5135                                                                      int vec_enc) {
5136   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5137   Label done;
5138   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5139   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5140   kortestwl(ktmp1, ktmp1);
5141   jccb(Assembler::equal, done);
5142 
5143   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5144   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5145   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5146 
5147   kxorwl(ktmp1, ktmp1, ktmp2);
5148   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5149   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5150   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5151   bind(done);
5152 }
5153 
5154 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5156  * If src is NaN, the result is 0.
5157  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5158  * the result is equal to the value of Long.MIN_VALUE.
5159  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5160  * the result is equal to the value of Long.MAX_VALUE.
5161  */
5162 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5163                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5164                                                                       Register rscratch, AddressLiteral double_sign_flip,
5165                                                                       int vec_enc) {
5166   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5167 
5168   Label done;
5169   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5170   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5171   kortestwl(ktmp1, ktmp1);
5172   jccb(Assembler::equal, done);
5173 
5174   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5175   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5176   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5177 
5178   kxorwl(ktmp1, ktmp1, ktmp2);
5179   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5180   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5181   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5182   bind(done);
5183 }
5184 
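// Pack the doublewords selected by the vshufps immediate from each quadword lane of src into
// the low half of dst; for 256-bit vectors the high 128-bit lane is extracted first so the
// pack crosses lanes.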
5185 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5186                                                              XMMRegister xtmp, int index, int vec_enc) {
5187    assert(vec_enc < Assembler::AVX_512bit, "");
5188    if (vec_enc == Assembler::AVX_256bit) {
5189      vextractf128_high(xtmp, src);
5190      vshufps(dst, src, xtmp, index, vec_enc);
5191    } else {
5192      vshufps(dst, src, zero, index, vec_enc);
5193    }
5194 }
5195 
5196 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5197                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5198                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5199   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5200 
5201   Label done;
5202   // Compare the destination lanes with float_sign_flip
5203   // value to get mask for all special values.
5204   movdqu(xtmp1, float_sign_flip, rscratch);
5205   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5206   ptest(xtmp2, xtmp2);
5207   jccb(Assembler::equal, done);
5208 
5209   // Flip float_sign_flip to get max integer value.
5210   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5211   pxor(xtmp1, xtmp4);
5212 
  // Set destination lanes corresponding to unordered source lanes to zero.
5214   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5215   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5216 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5218   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5219   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5220 
  // Recompute the mask for the remaining special-value lanes.
5222   pxor(xtmp2, xtmp3);
5223   // Extract mask corresponding to non-negative source lanes.
5224   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5225 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5227   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5228   pand(xtmp3, xtmp2);
5229 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5232   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5233   bind(done);
5234 }
5235 
5236 
5237 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5238                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5239   switch(to_elem_bt) {
5240     case T_SHORT:
5241       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5242       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5243       vpackusdw(dst, dst, zero, vec_enc);
5244       if (vec_enc == Assembler::AVX_256bit) {
5245         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5246       }
5247       break;
5248     case  T_BYTE:
5249       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5250       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5251       vpackusdw(dst, dst, zero, vec_enc);
5252       if (vec_enc == Assembler::AVX_256bit) {
5253         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5254       }
5255       vpackuswb(dst, dst, zero, vec_enc);
5256       break;
5257     default: assert(false, "%s", type2name(to_elem_bt));
5258   }
5259 }
5260 
5261 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Take the fast path if no result vector lane contains the 0x80000000 value; that value
 *    signifies that the source could have been one of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5269  */
5270 
5271 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5272                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5273                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5274   int to_elem_sz = type2aelembytes(to_elem_bt);
5275   assert(to_elem_sz <= 4, "");
5276   vcvttps2dq(dst, src, vec_enc);
5277   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5278   if (to_elem_sz < 4) {
5279     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5280     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5281   }
5282 }
5283 
5284 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5285                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5286                                             Register rscratch, int vec_enc) {
5287   int to_elem_sz = type2aelembytes(to_elem_bt);
5288   assert(to_elem_sz <= 4, "");
5289   vcvttps2dq(dst, src, vec_enc);
5290   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5291   switch(to_elem_bt) {
5292     case T_INT:
5293       break;
5294     case T_SHORT:
5295       evpmovdw(dst, dst, vec_enc);
5296       break;
5297     case T_BYTE:
5298       evpmovdb(dst, dst, vec_enc);
5299       break;
5300     default: assert(false, "%s", type2name(to_elem_bt));
5301   }
5302 }
5303 
5304 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5305                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5306                                             Register rscratch, int vec_enc) {
5307   evcvttps2qq(dst, src, vec_enc);
5308   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5309 }
5310 
5311 // Handling for downcasting from double to integer or sub-word types on AVX2.
5312 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5313                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5314                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5315   int to_elem_sz = type2aelembytes(to_elem_bt);
5316   assert(to_elem_sz < 8, "");
5317   vcvttpd2dq(dst, src, vec_enc);
5318   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5319                                               float_sign_flip, vec_enc);
5320   if (to_elem_sz < 4) {
5321     // xtmp4 holds all zero lanes.
5322     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5323   }
5324 }
5325 
5326 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5327                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5328                                             KRegister ktmp2, AddressLiteral sign_flip,
5329                                             Register rscratch, int vec_enc) {
5330   if (VM_Version::supports_avx512dq()) {
5331     evcvttpd2qq(dst, src, vec_enc);
5332     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5333     switch(to_elem_bt) {
5334       case T_LONG:
5335         break;
5336       case T_INT:
5337         evpmovsqd(dst, dst, vec_enc);
5338         break;
5339       case T_SHORT:
5340         evpmovsqd(dst, dst, vec_enc);
5341         evpmovdw(dst, dst, vec_enc);
5342         break;
5343       case T_BYTE:
5344         evpmovsqd(dst, dst, vec_enc);
5345         evpmovdb(dst, dst, vec_enc);
5346         break;
5347       default: assert(false, "%s", type2name(to_elem_bt));
5348     }
5349   } else {
5350     assert(type2aelembytes(to_elem_bt) <= 4, "");
5351     vcvttpd2dq(dst, src, vec_enc);
5352     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5353     switch(to_elem_bt) {
5354       case T_INT:
5355         break;
5356       case T_SHORT:
5357         evpmovdw(dst, dst, vec_enc);
5358         break;
5359       case T_BYTE:
5360         evpmovdb(dst, dst, vec_enc);
5361         break;
5362       default: assert(false, "%s", type2name(to_elem_bt));
5363     }
5364   }
5365 }
5366 
5367 #ifdef _LP64
5368 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5369                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5370                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and restore the original MXCSR.RC mode after that.
5373   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5374 
5375   mov64(tmp, julong_cast(0.5L));
5376   evpbroadcastq(xtmp1, tmp, vec_enc);
5377   vaddpd(xtmp1, src , xtmp1, vec_enc);
5378   evcvtpd2qq(dst, xtmp1, vec_enc);
5379   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5381 
5382   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5383 }
5384 
5385 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5386                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5387                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and restore the original MXCSR.RC mode after that.
5390   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5391 
5392   movl(tmp, jint_cast(0.5));
5393   movq(xtmp1, tmp);
5394   vbroadcastss(xtmp1, xtmp1, vec_enc);
5395   vaddps(xtmp1, src , xtmp1, vec_enc);
5396   vcvtps2dq(dst, xtmp1, vec_enc);
5397   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5398                                               float_sign_flip, vec_enc);
5399 
5400   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5401 }
5402 
5403 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5404                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5405                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and restore the original MXCSR.RC mode after that.
5408   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5409 
5410   movl(tmp, jint_cast(0.5));
5411   movq(xtmp1, tmp);
5412   vbroadcastss(xtmp1, xtmp1, vec_enc);
5413   vaddps(xtmp1, src , xtmp1, vec_enc);
5414   vcvtps2dq(dst, xtmp1, vec_enc);
5415   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5416 
5417   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5418 }
5419 #endif // _LP64
5420 
5421 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5422                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5423   switch (from_elem_bt) {
5424     case T_BYTE:
5425       switch (to_elem_bt) {
5426         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5427         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5428         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5429         default: ShouldNotReachHere();
5430       }
5431       break;
5432     case T_SHORT:
5433       switch (to_elem_bt) {
5434         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5435         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5436         default: ShouldNotReachHere();
5437       }
5438       break;
5439     case T_INT:
5440       assert(to_elem_bt == T_LONG, "");
5441       vpmovzxdq(dst, src, vlen_enc);
5442       break;
5443     default:
5444       ShouldNotReachHere();
5445   }
5446 }
5447 
5448 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5449                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5450   switch (from_elem_bt) {
5451     case T_BYTE:
5452       switch (to_elem_bt) {
5453         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5454         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5455         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5456         default: ShouldNotReachHere();
5457       }
5458       break;
5459     case T_SHORT:
5460       switch (to_elem_bt) {
5461         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5462         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5463         default: ShouldNotReachHere();
5464       }
5465       break;
5466     case T_INT:
5467       assert(to_elem_bt == T_LONG, "");
5468       vpmovsxdq(dst, src, vlen_enc);
5469       break;
5470     default:
5471       ShouldNotReachHere();
5472   }
5473 }
5474 
5475 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5476                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5477   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5478   assert(vlen_enc != AVX_512bit, "");
5479 
5480   int dst_bt_size = type2aelembytes(dst_bt);
5481   int src_bt_size = type2aelembytes(src_bt);
5482   if (dst_bt_size > src_bt_size) {
5483     switch (dst_bt_size / src_bt_size) {
5484       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5485       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5486       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5487       default: ShouldNotReachHere();
5488     }
5489   } else {
5490     assert(dst_bt_size < src_bt_size, "");
5491     switch (src_bt_size / dst_bt_size) {
5492       case 2: {
5493         if (vlen_enc == AVX_128bit) {
5494           vpacksswb(dst, src, src, vlen_enc);
5495         } else {
5496           vpacksswb(dst, src, src, vlen_enc);
5497           vpermq(dst, dst, 0x08, vlen_enc);
5498         }
5499         break;
5500       }
5501       case 4: {
5502         if (vlen_enc == AVX_128bit) {
5503           vpackssdw(dst, src, src, vlen_enc);
5504           vpacksswb(dst, dst, dst, vlen_enc);
5505         } else {
5506           vpackssdw(dst, src, src, vlen_enc);
5507           vpermq(dst, dst, 0x08, vlen_enc);
5508           vpacksswb(dst, dst, dst, AVX_128bit);
5509         }
5510         break;
5511       }
5512       case 8: {
5513         if (vlen_enc == AVX_128bit) {
5514           vpshufd(dst, src, 0x08, vlen_enc);
5515           vpackssdw(dst, dst, dst, vlen_enc);
5516           vpacksswb(dst, dst, dst, vlen_enc);
5517         } else {
5518           vpshufd(dst, src, 0x08, vlen_enc);
5519           vpermq(dst, dst, 0x08, vlen_enc);
5520           vpackssdw(dst, dst, dst, AVX_128bit);
5521           vpacksswb(dst, dst, dst, AVX_128bit);
5522         }
5523         break;
5524       }
5525       default: ShouldNotReachHere();
5526     }
5527   }
5528 }
5529 
5530 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5531                                    bool merge, BasicType bt, int vlen_enc) {
5532   if (bt == T_INT) {
5533     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5534   } else {
5535     assert(bt == T_LONG, "");
5536     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5537   }
5538 }
5539 
5540 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5541                                    bool merge, BasicType bt, int vlen_enc) {
5542   if (bt == T_INT) {
5543     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5544   } else {
5545     assert(bt == T_LONG, "");
5546     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5547   }
5548 }
5549 
5550 #ifdef _LP64
5551 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5552                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5553                                                int vec_enc) {
5554   int index = 0;
5555   int vindex = 0;
5556   mov64(rtmp1, 0x0101010101010101L);
5557   pdepq(rtmp1, src, rtmp1);
5558   if (mask_len > 8) {
5559     movq(rtmp2, src);
5560     vpxor(xtmp, xtmp, xtmp, vec_enc);
5561     movq(xtmp, rtmp1);
5562   }
5563   movq(dst, rtmp1);
5564 
5565   mask_len -= 8;
5566   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask_len must be a multiple of 8");
5568     index++;
5569     if ((index % 2) == 0) {
5570       pxor(xtmp, xtmp);
5571     }
5572     mov64(rtmp1, 0x0101010101010101L);
5573     shrq(rtmp2, 8);
5574     pdepq(rtmp1, rtmp2, rtmp1);
5575     pinsrq(xtmp, rtmp1, index % 2);
5576     vindex = index / 2;
5577     if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes have been updated, to avoid redundant instructions.
5580       if (index % 2) {
5581         vinsertf128(dst, dst, xtmp, vindex);
5582       }
5583     } else {
5584       vmovdqu(dst, xtmp);
5585     }
5586     mask_len -= 8;
5587   }
5588 }
5589 
5590 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5591   switch(opc) {
5592     case Op_VectorMaskTrueCount:
5593       popcntq(dst, tmp);
5594       break;
5595     case Op_VectorMaskLastTrue:
5596       if (VM_Version::supports_lzcnt()) {
5597         lzcntq(tmp, tmp);
5598         movl(dst, 63);
5599         subl(dst, tmp);
5600       } else {
5601         movl(dst, -1);
5602         bsrq(tmp, tmp);
5603         cmov32(Assembler::notZero, dst, tmp);
5604       }
5605       break;
5606     case Op_VectorMaskFirstTrue:
5607       if (VM_Version::supports_bmi1()) {
5608         if (masklen < 32) {
5609           orl(tmp, 1 << masklen);
5610           tzcntl(dst, tmp);
5611         } else if (masklen == 32) {
5612           tzcntl(dst, tmp);
5613         } else {
5614           assert(masklen == 64, "");
5615           tzcntq(dst, tmp);
5616         }
5617       } else {
5618         if (masklen < 32) {
5619           orl(tmp, 1 << masklen);
5620           bsfl(dst, tmp);
5621         } else {
5622           assert(masklen == 32 || masklen == 64, "");
5623           movl(dst, masklen);
5624           if (masklen == 32)  {
5625             bsfl(tmp, tmp);
5626           } else {
5627             bsfq(tmp, tmp);
5628           }
5629           cmov32(Assembler::notZero, dst, tmp);
5630         }
5631       }
5632       break;
5633     case Op_VectorMaskToLong:
5634       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5635       break;
5636     default: assert(false, "Unhandled mask operation");
5637   }
5638 }
5639 
5640 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5641                                               int masklen, int masksize, int vec_enc) {
5642   assert(VM_Version::supports_popcnt(), "");
5643 
5644   if(VM_Version::supports_avx512bw()) {
5645     kmovql(tmp, mask);
5646   } else {
5647     assert(masklen <= 16, "");
5648     kmovwl(tmp, mask);
5649   }
5650 
  // A mask generated out of partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5653   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5654     andq(tmp, (1 << masklen) - 1);
5655   }
5656 
5657   vector_mask_operation_helper(opc, dst, tmp, masklen);
5658 }
5659 
5660 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5661                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5662   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5663          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5664   assert(VM_Version::supports_popcnt(), "");
5665 
5666   bool need_clip = false;
5667   switch(bt) {
5668     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5670       vpxor(xtmp, xtmp, xtmp, vec_enc);
5671       vpsubb(xtmp, xtmp, mask, vec_enc);
5672       vpmovmskb(tmp, xtmp, vec_enc);
5673       need_clip = masklen < 16;
5674       break;
5675     case T_BYTE:
5676       vpmovmskb(tmp, mask, vec_enc);
5677       need_clip = masklen < 16;
5678       break;
5679     case T_SHORT:
5680       vpacksswb(xtmp, mask, mask, vec_enc);
5681       if (masklen >= 16) {
5682         vpermpd(xtmp, xtmp, 8, vec_enc);
5683       }
5684       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5685       need_clip = masklen < 16;
5686       break;
5687     case T_INT:
5688     case T_FLOAT:
5689       vmovmskps(tmp, mask, vec_enc);
5690       need_clip = masklen < 4;
5691       break;
5692     case T_LONG:
5693     case T_DOUBLE:
5694       vmovmskpd(tmp, mask, vec_enc);
5695       need_clip = masklen < 2;
5696       break;
5697     default: assert(false, "Unhandled type, %s", type2name(bt));
5698   }
5699 
  // A mask generated out of partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5702   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5703     // need_clip implies masklen < 32
5704     andq(tmp, (1 << masklen) - 1);
5705   }
5706 
5707   vector_mask_operation_helper(opc, dst, tmp, masklen);
5708 }
5709 
5710 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5711                                              Register rtmp2, int mask_len) {
5712   kmov(rtmp1, src);
5713   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5714   mov64(rtmp2, -1L);
5715   pextq(rtmp2, rtmp2, rtmp1);
5716   kmov(dst, rtmp2);
5717 }
5718 
5719 #ifdef _LP64
5720 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5721                                                     XMMRegister mask, Register rtmp, Register rscratch,
5722                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5723                                                     int vec_enc) {
5724   assert(type2aelembytes(bt) >= 4, "");
5725   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5726   address compress_perm_table = nullptr;
5727   address expand_perm_table = nullptr;
5728   if (type2aelembytes(bt) == 8) {
5729     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5730     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5731     vmovmskpd(rtmp, mask, vec_enc);
5732   } else {
5733     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5734     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5735     vmovmskps(rtmp, mask, vec_enc);
5736   }
5737   shlq(rtmp, 5); // for 32 byte permute row.
5738   if (opcode == Op_CompressV) {
5739     lea(rscratch, ExternalAddress(compress_perm_table));
5740   } else {
5741     lea(rscratch, ExternalAddress(expand_perm_table));
5742   }
5743   addptr(rtmp, rscratch);
5744   vmovdqu(permv, Address(rtmp));
5745   vpermps(dst, permv, src, Assembler::AVX_256bit);
5746   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask. Each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, which can therefore double as a blending mask after
  // compressing/expanding the source vector lanes.
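  // For illustration (assuming the row layout described above): for four int lanes with
  // mask {1, 0, 1, 0}, vmovmskps yields 0b0101 and the corresponding compress row would
  // look like [0, 2, -1, -1, ...]; vpermps gathers src lanes 0 and 2 to the front, and
  // the -1 entries (sign bit set) select the zero lanes in the vblendvps below.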
5751   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5752 }
5753 #endif
5754 
5755 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5756                                                bool merge, BasicType bt, int vec_enc) {
5757   if (opcode == Op_CompressV) {
5758     switch(bt) {
5759     case T_BYTE:
5760       evpcompressb(dst, mask, src, merge, vec_enc);
5761       break;
5762     case T_CHAR:
5763     case T_SHORT:
5764       evpcompressw(dst, mask, src, merge, vec_enc);
5765       break;
5766     case T_INT:
5767       evpcompressd(dst, mask, src, merge, vec_enc);
5768       break;
5769     case T_FLOAT:
5770       evcompressps(dst, mask, src, merge, vec_enc);
5771       break;
5772     case T_LONG:
5773       evpcompressq(dst, mask, src, merge, vec_enc);
5774       break;
5775     case T_DOUBLE:
5776       evcompresspd(dst, mask, src, merge, vec_enc);
5777       break;
5778     default:
5779       fatal("Unsupported type %s", type2name(bt));
5780       break;
5781     }
5782   } else {
5783     assert(opcode == Op_ExpandV, "");
5784     switch(bt) {
5785     case T_BYTE:
5786       evpexpandb(dst, mask, src, merge, vec_enc);
5787       break;
5788     case T_CHAR:
5789     case T_SHORT:
5790       evpexpandw(dst, mask, src, merge, vec_enc);
5791       break;
5792     case T_INT:
5793       evpexpandd(dst, mask, src, merge, vec_enc);
5794       break;
5795     case T_FLOAT:
5796       evexpandps(dst, mask, src, merge, vec_enc);
5797       break;
5798     case T_LONG:
5799       evpexpandq(dst, mask, src, merge, vec_enc);
5800       break;
5801     case T_DOUBLE:
5802       evexpandpd(dst, mask, src, merge, vec_enc);
5803       break;
5804     default:
5805       fatal("Unsupported type %s", type2name(bt));
5806       break;
5807     }
5808   }
5809 }
5810 #endif
5811 
5812 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5813                                            KRegister ktmp1, int vec_enc) {
5814   if (opcode == Op_SignumVD) {
5815     vsubpd(dst, zero, one, vec_enc);
5816     // if src < 0 ? -1 : 1
5817     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5818     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5819     // if src == NaN, -0.0 or 0.0 return src.
5820     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5821     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5822   } else {
5823     assert(opcode == Op_SignumVF, "");
5824     vsubps(dst, zero, one, vec_enc);
5825     // if src < 0 ? -1 : 1
5826     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5827     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5828     // if src == NaN, -0.0 or 0.0 return src.
5829     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5830     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5831   }
5832 }
5833 
5834 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5835                                           XMMRegister xtmp1, int vec_enc) {
5836   if (opcode == Op_SignumVD) {
5837     vsubpd(dst, zero, one, vec_enc);
5838     // if src < 0 ? -1 : 1
5839     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5840     // if src == NaN, -0.0 or 0.0 return src.
5841     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5842     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5843   } else {
5844     assert(opcode == Op_SignumVF, "");
5845     vsubps(dst, zero, one, vec_enc);
5846     // if src < 0 ? -1 : 1
5847     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5848     // if src == NaN, -0.0 or 0.0 return src.
5849     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5850     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5851   }
5852 }
5853 
5854 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5855   if (VM_Version::supports_avx512bw()) {
5856     if (mask_len > 32) {
5857       kmovql(dst, src);
5858     } else {
5859       kmovdl(dst, src);
5860       if (mask_len != 32) {
5861         kshiftrdl(dst, dst, 32 - mask_len);
5862       }
5863     }
5864   } else {
5865     assert(mask_len <= 16, "");
5866     kmovwl(dst, src);
5867     if (mask_len != 16) {
5868       kshiftrwl(dst, dst, 16 - mask_len);
5869     }
5870   }
5871 }
5872 
5873 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5874   int lane_size = type2aelembytes(bt);
5875   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5876   if ((is_LP64 || lane_size < 8) &&
5877       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5878        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5879     movptr(rtmp, imm32);
5880     switch(lane_size) {
5881       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5882       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5883       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5884       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
5887     }
5888   } else {
5889     movptr(rtmp, imm32);
5890     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5891     switch(lane_size) {
5892       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5893       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5894       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5895       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
5898     }
5899   }
5900 }
5901 
5902 //
// Following is the lookup table based popcount computation algorithm:
5904 //       Index   Bit set count
5905 //     [ 0000 ->   0,
5906 //       0001 ->   1,
5907 //       0010 ->   1,
5908 //       0011 ->   2,
5909 //       0100 ->   1,
5910 //       0101 ->   2,
5911 //       0110 ->   2,
5912 //       0111 ->   3,
5913 //       1000 ->   1,
5914 //       1001 ->   2,
5915 //       1010 ->   3,
5916 //       1011 ->   3,
5917 //       1100 ->   2,
5918 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
5925 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5926 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5927 //     count of all the bytes of a quadword.
5928 //  f. Perform step e. for upper 128bit vector lane.
5929 //  g. Pack the bitset count of quadwords back to double word.
5930 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
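//
//  For illustration, consider the byte 0xB6 (1011 0110): the lower nibble 0110 looks
//  up a count of 2, the upper nibble 1011 (after the right shift of step b.) looks up
//  a count of 3, and step d. yields 2 + 3 = 5 == popcount(0xB6).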
5931 
5932 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5933                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5934   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5935   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5936   vpsrlw(dst, src, 4, vec_enc);
5937   vpand(dst, dst, xtmp1, vec_enc);
5938   vpand(xtmp1, src, xtmp1, vec_enc);
5939   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5940   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5941   vpshufb(dst, xtmp2, dst, vec_enc);
5942   vpaddb(dst, dst, xtmp1, vec_enc);
5943 }
5944 
5945 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5946                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5947   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code is as per steps e, f, g and h of the above algorithm.
5949   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5950   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5951   vpsadbw(dst, dst, xtmp2, vec_enc);
5952   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5953   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5954   vpackuswb(dst, xtmp1, dst, vec_enc);
5955 }
5956 
5957 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5958                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5959   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5960   // Add the popcount of upper and lower bytes of word.
5961   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5962   vpsrlw(dst, xtmp1, 8, vec_enc);
5963   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5964   vpaddw(dst, dst, xtmp1, vec_enc);
5965 }
5966 
5967 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5968                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5969   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5970   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5971   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5972 }
5973 
5974 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5975                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5976   switch(bt) {
5977     case T_LONG:
5978       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5979       break;
5980     case T_INT:
5981       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5982       break;
5983     case T_CHAR:
5984     case T_SHORT:
5985       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5986       break;
5987     case T_BYTE:
5988     case T_BOOLEAN:
5989       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5990       break;
5991     default:
5992       fatal("Unsupported type %s", type2name(bt));
5993       break;
5994   }
5995 }
5996 
5997 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5998                                                       KRegister mask, bool merge, int vec_enc) {
5999   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6000   switch(bt) {
6001     case T_LONG:
6002       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6003       evpopcntq(dst, mask, src, merge, vec_enc);
6004       break;
6005     case T_INT:
6006       assert(VM_Version::supports_avx512_vpopcntdq(), "");
6007       evpopcntd(dst, mask, src, merge, vec_enc);
6008       break;
6009     case T_CHAR:
6010     case T_SHORT:
6011       assert(VM_Version::supports_avx512_bitalg(), "");
6012       evpopcntw(dst, mask, src, merge, vec_enc);
6013       break;
6014     case T_BYTE:
6015     case T_BOOLEAN:
6016       assert(VM_Version::supports_avx512_bitalg(), "");
6017       evpopcntb(dst, mask, src, merge, vec_enc);
6018       break;
6019     default:
6020       fatal("Unsupported type %s", type2name(bt));
6021       break;
6022   }
6023 }
6024 
6025 #ifndef _LP64
6026 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
6027   assert(VM_Version::supports_avx512bw(), "");
6028   kmovdl(tmp, src);
6029   kunpckdql(dst, tmp, tmp);
6030 }
6031 #endif
6032 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reversed bit sequences of its upper and lower
// nibbles.
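// For illustration, consider the byte 0xB4 (1011 0100): the table lookup gives
// reverse(0100) = 0010 and reverse(1011) = 1101; placing the reversed lower nibble
// in the upper half and the reversed upper nibble in the lower half yields
// 0010 1101 = 0x2D, i.e. 0xB4 with its bits reversed.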
6039 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6040                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
6041   if (VM_Version::supports_avx512vlbw()) {
6042 
6043     // Get the reverse bit sequence of lower nibble of each byte.
6044     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6045     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6046     evpandq(dst, xtmp2, src, vec_enc);
6047     vpshufb(dst, xtmp1, dst, vec_enc);
6048     vpsllq(dst, dst, 4, vec_enc);
6049 
6050     // Get the reverse bit sequence of upper nibble of each byte.
6051     vpandn(xtmp2, xtmp2, src, vec_enc);
6052     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6053     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6054 
    // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
    // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6057     evporq(xtmp2, dst, xtmp2, vec_enc);
6058     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6059 
6060   } else if(vec_enc == Assembler::AVX_512bit) {
6061     // Shift based bit reversal.
6062     assert(bt == T_LONG || bt == T_INT, "");
6063 
6064     // Swap lower and upper nibble of each byte.
6065     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6066 
6067     // Swap two least and most significant bits of each nibble.
6068     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6069 
6070     // Swap adjacent pair of bits.
6071     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6072     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6073 
6074     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6075     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6076   } else {
6077     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6078     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6079 
6080     // Get the reverse bit sequence of lower nibble of each byte.
6081     vpand(dst, xtmp2, src, vec_enc);
6082     vpshufb(dst, xtmp1, dst, vec_enc);
6083     vpsllq(dst, dst, 4, vec_enc);
6084 
6085     // Get the reverse bit sequence of upper nibble of each byte.
6086     vpandn(xtmp2, xtmp2, src, vec_enc);
6087     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6088     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6089 
    // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
    // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
6092     vpor(xtmp2, dst, xtmp2, vec_enc);
6093     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6094   }
6095 }
6096 
6097 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6098                                                 XMMRegister xtmp, Register rscratch) {
6099   assert(VM_Version::supports_gfni(), "");
6100   assert(rscratch != noreg || always_reachable(mask), "missing");
6101 
  // Galois field instruction based bit reversal, following the algorithm described at:
6103   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
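  // Note: the broadcast mask is expected to hold the 8x8 bit matrix 0x8040201008040201
  // (the same constant used by reverseI/reverseL below), for which vgf2p8affineqb emits
  // every byte with its bits in reverse order.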
6104   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6105   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6106   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6107 }
6108 
6109 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6110                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6111   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6112   evpandq(dst, xtmp1, src, vec_enc);
6113   vpsllq(dst, dst, nbits, vec_enc);
6114   vpandn(xtmp1, xtmp1, src, vec_enc);
6115   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6116   evporq(dst, dst, xtmp1, vec_enc);
6117 }
6118 
6119 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6120                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based byte reversal (uses shifts/rotates instead of a vpshufb permute).
6122   assert(VM_Version::supports_evex(), "");
6123   switch(bt) {
6124     case T_LONG:
6125       // Swap upper and lower double word of each quad word.
6126       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6127       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6128       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6129       break;
6130     case T_INT:
6131       // Swap upper and lower word of each double word.
6132       evprord(xtmp1, k0, src, 16, true, vec_enc);
6133       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6134       break;
6135     case T_CHAR:
6136     case T_SHORT:
6137       // Swap upper and lower byte of each word.
6138       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6139       break;
6140     case T_BYTE:
6141       evmovdquq(dst, k0, src, true, vec_enc);
6142       break;
6143     default:
6144       fatal("Unsupported type %s", type2name(bt));
6145       break;
6146   }
6147 }
6148 
6149 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6150   if (bt == T_BYTE) {
6151     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6152       evmovdquq(dst, k0, src, true, vec_enc);
6153     } else {
6154       vmovdqu(dst, src);
6155     }
6156     return;
6157   }
6158   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6159   // pre-computed shuffle indices.
6160   switch(bt) {
6161     case T_LONG:
6162       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6163       break;
6164     case T_INT:
6165       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6166       break;
6167     case T_CHAR:
6168     case T_SHORT:
6169       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6170       break;
6171     default:
6172       fatal("Unsupported type %s", type2name(bt));
6173       break;
6174   }
6175   vpshufb(dst, src, dst, vec_enc);
6176 }
6177 
6178 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6179                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6180                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6181   assert(is_integral_type(bt), "");
6182   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6183   assert(VM_Version::supports_avx512cd(), "");
6184   switch(bt) {
6185     case T_LONG:
6186       evplzcntq(dst, ktmp, src, merge, vec_enc);
6187       break;
6188     case T_INT:
6189       evplzcntd(dst, ktmp, src, merge, vec_enc);
6190       break;
6191     case T_SHORT:
6192       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6193       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6194       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6195       vpunpckhwd(dst, xtmp1, src, vec_enc);
6196       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6197       vpackusdw(dst, xtmp2, dst, vec_enc);
6198       break;
6199     case T_BYTE:
6200       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6201       // accessing the lookup table.
6202       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6203       // accessing the lookup table.
6204       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
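      // For illustration, the lookup table is expected to map a nibble to its leading
      // zero count within 4 bits (0 -> 4, 1 -> 3, 2-3 -> 2, 4-7 -> 1, 8-15 -> 0):
      // for the byte 0x06, T2 = 4 and, the upper nibble being zero, T1 = 1 is added,
      // giving lzcnt = 5; for 0x35, the nonzero upper nibble already gives T2 = 2.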
6205       assert(VM_Version::supports_avx512bw(), "");
6206       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6207       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6208       vpand(xtmp2, dst, src, vec_enc);
6209       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6210       vpsrlw(xtmp3, src, 4, vec_enc);
6211       vpand(xtmp3, dst, xtmp3, vec_enc);
6212       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6213       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6214       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6215       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6216       break;
6217     default:
6218       fatal("Unsupported type %s", type2name(bt));
6219       break;
6220   }
6221 }
6222 
6223 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6224                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6225   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6226   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6227   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6228   // accessing the lookup table.
6229   vpand(dst, xtmp2, src, vec_enc);
6230   vpshufb(dst, xtmp1, dst, vec_enc);
6231   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6232   // accessing the lookup table.
6233   vpsrlw(xtmp3, src, 4, vec_enc);
6234   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6235   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6236   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6237   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6238   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6239   vpaddb(dst, dst, xtmp2, vec_enc);
6240   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6241 }
6242 
6243 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6244                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6245   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6246   // Add zero counts of lower byte and upper byte of a word if
6247   // upper byte holds a zero value.
6248   vpsrlw(xtmp3, src, 8, vec_enc);
6249   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6250   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6251   vpsllw(xtmp2, dst, 8, vec_enc);
6252   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6253   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6254   vpsrlw(dst, dst, 8, vec_enc);
6255 }
6256 
6257 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6258                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.0 format,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for zero, max_int and negative source values.
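  // For illustration, src = 5 converts to 1.25 * 2^2 (biased_exp = 129), giving
  // LZCNT = 31 - (129 - 127) = 29, which matches lzcnt(5) for a 32 bit lane.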
6264 
6265   // Broadcast 0xFF
6266   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6267   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6268 
6269   // Extract biased exponent.
6270   vcvtdq2ps(dst, src, vec_enc);
6271   vpsrld(dst, dst, 23, vec_enc);
6272   vpand(dst, dst, xtmp1, vec_enc);
6273 
6274   // Broadcast 127.
6275   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6276   // Exponent = biased_exp - 127
6277   vpsubd(dst, dst, xtmp1, vec_enc);
6278 
6279   // Exponent = Exponent  + 1
6280   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6281   vpaddd(dst, dst, xtmp3, vec_enc);
6282 
  // Replace a negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
6285   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6286   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6287 
6288   // Rematerialize broadcast 32.
6289   vpslld(xtmp1, xtmp3, 5, vec_enc);
6290   // Exponent is 32 if corresponding source lane contains max_int value.
6291   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6292   // LZCNT = 32 - exponent
6293   vpsubd(dst, xtmp1, dst, vec_enc);
6294 
6295   // Replace LZCNT with a value 1 if corresponding source lane
6296   // contains max_int value.
6297   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6298 
  // Replace the computed count with 0 if the source lane value is negative (its lzcnt is 0).
6300   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6301   vblendvps(dst, dst, xtmp2, src, vec_enc);
6302 }
6303 
6304 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6305                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6306   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6307   // Add zero counts of lower word and upper word of a double word if
6308   // upper word holds a zero value.
6309   vpsrld(xtmp3, src, 16, vec_enc);
6310   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6311   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6312   vpslld(xtmp2, dst, 16, vec_enc);
6313   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6314   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6315   vpsrld(dst, dst, 16, vec_enc);
6316   // Add zero counts of lower doubleword and upper doubleword of a
6317   // quadword if upper doubleword holds a zero value.
6318   vpsrlq(xtmp3, src, 32, vec_enc);
6319   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6320   vpsllq(xtmp2, dst, 32, vec_enc);
6321   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6322   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6323   vpsrlq(dst, dst, 32, vec_enc);
6324 }
6325 
6326 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6327                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6328                                                        Register rtmp, int vec_enc) {
6329   assert(is_integral_type(bt), "unexpected type");
6330   assert(vec_enc < Assembler::AVX_512bit, "");
6331   switch(bt) {
6332     case T_LONG:
6333       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6334       break;
6335     case T_INT:
6336       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6337       break;
6338     case T_SHORT:
6339       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6340       break;
6341     case T_BYTE:
6342       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6343       break;
6344     default:
6345       fatal("Unsupported type %s", type2name(bt));
6346       break;
6347   }
6348 }
6349 
6350 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6351   switch(bt) {
6352     case T_BYTE:
6353       vpsubb(dst, src1, src2, vec_enc);
6354       break;
6355     case T_SHORT:
6356       vpsubw(dst, src1, src2, vec_enc);
6357       break;
6358     case T_INT:
6359       vpsubd(dst, src1, src2, vec_enc);
6360       break;
6361     case T_LONG:
6362       vpsubq(dst, src1, src2, vec_enc);
6363       break;
6364     default:
6365       fatal("Unsupported type %s", type2name(bt));
6366       break;
6367   }
6368 }
6369 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
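// For illustration, with 32 bit lanes and x = 8 (0b1000): (x - 1) & ~x = 0b0111,
// CLZ(0b0111) = 29, so CTZ = 32 - 29 = 3; for x = 0 the operand becomes all ones,
// CLZ = 0 and CTZ = 32 as expected.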
6374 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6375                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6376                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6377   assert(is_integral_type(bt), "");
6378   // xtmp = -1
6379   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6380   // xtmp = xtmp + src
6381   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6382   // xtmp = xtmp & ~src
6383   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6384   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6385   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6386   vpsub(bt, dst, xtmp4, dst, vec_enc);
6387 }
6388 
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
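// For illustration, with 32 bit lanes and x = 8: x | -x = 0xFFFFFFF8, POPC = 29,
// so CTZ = 32 - 29 = 3; for x = 0, x | -x = 0, POPC = 0 and CTZ = 32 as expected.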
6391 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6392                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6393   assert(is_integral_type(bt), "");
6394   // xtmp = 0
6395   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6396   // xtmp = 0 - src
6397   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6398   // xtmp = xtmp | src
6399   vpor(xtmp3, xtmp3, src, vec_enc);
6400   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6401   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6402   vpsub(bt, dst, xtmp1, dst, vec_enc);
6403 }
6404 
6405 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6406   Label done;
6407   Label neg_divisor_fastpath;
6408   cmpl(divisor, 0);
6409   jccb(Assembler::less, neg_divisor_fastpath);
6410   xorl(rdx, rdx);
6411   divl(divisor);
6412   jmpb(done);
6413   bind(neg_divisor_fastpath);
6414   // Fastpath for divisor < 0:
6415   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6416   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
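  // Since the divisor's unsigned value is >= 2^31, the unsigned quotient can only be
  // 0 or 1, and it is 1 exactly when dividend >= divisor (unsigned). For illustration,
  // dividend = 0xC0000000, divisor = 0x90000000: dividend - divisor = 0x30000000,
  // dividend & ~0x30000000 = 0xC0000000, and 0xC0000000 >>> 31 = 1.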
6417   movl(rdx, rax);
6418   subl(rdx, divisor);
6419   if (VM_Version::supports_bmi1()) {
6420     andnl(rax, rdx, rax);
6421   } else {
6422     notl(rdx);
6423     andl(rax, rdx);
6424   }
6425   shrl(rax, 31);
6426   bind(done);
6427 }
6428 
6429 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6430   Label done;
6431   Label neg_divisor_fastpath;
6432   cmpl(divisor, 0);
6433   jccb(Assembler::less, neg_divisor_fastpath);
6434   xorl(rdx, rdx);
6435   divl(divisor);
6436   jmpb(done);
6437   bind(neg_divisor_fastpath);
6438   // Fastpath when divisor < 0:
6439   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6440   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
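  // The arithmetic shift turns the 0/1 quotient into a 0/-1 mask, so divisor is
  // subtracted from the dividend only when the unsigned quotient is 1. For illustration,
  // dividend = 0xC0000000, divisor = 0x90000000: the mask is all ones, so
  // remainder = 0xC0000000 - 0x90000000 = 0x30000000.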
6441   movl(rdx, rax);
6442   subl(rax, divisor);
6443   if (VM_Version::supports_bmi1()) {
6444     andnl(rax, rax, rdx);
6445   } else {
6446     notl(rax);
6447     andl(rax, rdx);
6448   }
6449   sarl(rax, 31);
6450   andl(rax, divisor);
6451   subl(rdx, rax);
6452   bind(done);
6453 }
6454 
6455 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6456   Label done;
6457   Label neg_divisor_fastpath;
6458 
6459   cmpl(divisor, 0);
6460   jccb(Assembler::less, neg_divisor_fastpath);
6461   xorl(rdx, rdx);
6462   divl(divisor);
6463   jmpb(done);
6464   bind(neg_divisor_fastpath);
6465   // Fastpath for divisor < 0:
6466   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6467   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6468   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6469   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6470   movl(rdx, rax);
6471   subl(rax, divisor);
6472   if (VM_Version::supports_bmi1()) {
6473     andnl(rax, rax, rdx);
6474   } else {
6475     notl(rax);
6476     andl(rax, rdx);
6477   }
6478   movl(tmp, rax);
6479   shrl(rax, 31); // quotient
6480   sarl(tmp, 31);
6481   andl(tmp, divisor);
6482   subl(rdx, tmp); // remainder
6483   bind(done);
6484 }
6485 
6486 #ifdef _LP64
6487 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6488                                  XMMRegister xtmp2, Register rtmp) {
6489   if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at:
6491     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6492     mov64(rtmp, 0x8040201008040201L);
6493     movq(xtmp1, src);
6494     movq(xtmp2, rtmp);
6495     gf2p8affineqb(xtmp1, xtmp2, 0);
6496     movq(dst, xtmp1);
6497   } else {
6498     // Swap even and odd numbered bits.
6499     movl(rtmp, src);
6500     andl(rtmp, 0x55555555);
6501     shll(rtmp, 1);
6502     movl(dst, src);
6503     andl(dst, 0xAAAAAAAA);
6504     shrl(dst, 1);
6505     orl(dst, rtmp);
6506 
6507     // Swap LSB and MSB 2 bits of each nibble.
6508     movl(rtmp, dst);
6509     andl(rtmp, 0x33333333);
6510     shll(rtmp, 2);
6511     andl(dst, 0xCCCCCCCC);
6512     shrl(dst, 2);
6513     orl(dst, rtmp);
6514 
6515     // Swap LSB and MSB 4 bits of each byte.
6516     movl(rtmp, dst);
6517     andl(rtmp, 0x0F0F0F0F);
6518     shll(rtmp, 4);
6519     andl(dst, 0xF0F0F0F0);
6520     shrl(dst, 4);
6521     orl(dst, rtmp);
6522   }
6523   bswapl(dst);
6524 }
6525 
6526 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6527                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6528   if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at:
6530     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6531     mov64(rtmp1, 0x8040201008040201L);
6532     movq(xtmp1, src);
6533     movq(xtmp2, rtmp1);
6534     gf2p8affineqb(xtmp1, xtmp2, 0);
6535     movq(dst, xtmp1);
6536   } else {
6537     // Swap even and odd numbered bits.
6538     movq(rtmp1, src);
6539     mov64(rtmp2, 0x5555555555555555L);
6540     andq(rtmp1, rtmp2);
6541     shlq(rtmp1, 1);
6542     movq(dst, src);
6543     notq(rtmp2);
6544     andq(dst, rtmp2);
6545     shrq(dst, 1);
6546     orq(dst, rtmp1);
6547 
6548     // Swap LSB and MSB 2 bits of each nibble.
6549     movq(rtmp1, dst);
6550     mov64(rtmp2, 0x3333333333333333L);
6551     andq(rtmp1, rtmp2);
6552     shlq(rtmp1, 2);
6553     notq(rtmp2);
6554     andq(dst, rtmp2);
6555     shrq(dst, 2);
6556     orq(dst, rtmp1);
6557 
6558     // Swap LSB and MSB 4 bits of each byte.
6559     movq(rtmp1, dst);
6560     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6561     andq(rtmp1, rtmp2);
6562     shlq(rtmp1, 4);
6563     notq(rtmp2);
6564     andq(dst, rtmp2);
6565     shrq(dst, 4);
6566     orq(dst, rtmp1);
6567   }
6568   bswapq(dst);
6569 }
6570 
6571 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6572   Label done;
6573   Label neg_divisor_fastpath;
6574   cmpq(divisor, 0);
6575   jccb(Assembler::less, neg_divisor_fastpath);
6576   xorl(rdx, rdx);
6577   divq(divisor);
6578   jmpb(done);
6579   bind(neg_divisor_fastpath);
6580   // Fastpath for divisor < 0:
6581   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6582   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6583   movq(rdx, rax);
6584   subq(rdx, divisor);
6585   if (VM_Version::supports_bmi1()) {
6586     andnq(rax, rdx, rax);
6587   } else {
6588     notq(rdx);
6589     andq(rax, rdx);
6590   }
6591   shrq(rax, 63);
6592   bind(done);
6593 }
6594 
6595 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6596   Label done;
6597   Label neg_divisor_fastpath;
6598   cmpq(divisor, 0);
6599   jccb(Assembler::less, neg_divisor_fastpath);
6600   xorq(rdx, rdx);
6601   divq(divisor);
6602   jmp(done);
6603   bind(neg_divisor_fastpath);
6604   // Fastpath when divisor < 0:
6605   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6606   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6607   movq(rdx, rax);
6608   subq(rax, divisor);
6609   if (VM_Version::supports_bmi1()) {
6610     andnq(rax, rax, rdx);
6611   } else {
6612     notq(rax);
6613     andq(rax, rdx);
6614   }
6615   sarq(rax, 63);
6616   andq(rax, divisor);
6617   subq(rdx, rax);
6618   bind(done);
6619 }
6620 
6621 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6622   Label done;
6623   Label neg_divisor_fastpath;
6624   cmpq(divisor, 0);
6625   jccb(Assembler::less, neg_divisor_fastpath);
6626   xorq(rdx, rdx);
6627   divq(divisor);
6628   jmp(done);
6629   bind(neg_divisor_fastpath);
6630   // Fastpath for divisor < 0:
6631   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6632   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6633   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6634   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6635   movq(rdx, rax);
6636   subq(rax, divisor);
6637   if (VM_Version::supports_bmi1()) {
6638     andnq(rax, rax, rdx);
6639   } else {
6640     notq(rax);
6641     andq(rax, rdx);
6642   }
6643   movq(tmp, rax);
6644   shrq(rax, 63); // quotient
6645   sarq(tmp, 63);
6646   andq(tmp, divisor);
6647   subq(rdx, tmp); // remainder
6648   bind(done);
6649 }
6650 #endif
6651 
6652 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6653                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6654                                         int vlen_enc) {
6655   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the index is determined by the
  // lower 4 bits of each shuffle byte, thus all shuffle indices are
  // effectively normalized to the index range 0-15. As a result, indices that
  // differ by a multiple of 16 select the same relative position within a
  // 128 bit lane, e.g. shuffle indices 5, 21, 37 and 53 all select the 5th
  // element of their respective 128 bit lanes.
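  // For illustration, a 512 bit shuffle index of 37 has 37 & 0xF == 5 and lies in the
  // range [32, 48), so the pass below that broadcasts the third 128 bit lane selects it
  // and vpshufb picks byte 5 of that lane, i.e. source byte 37.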
6662   movl(rtmp, 16);
6663   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6664 
6665   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6666   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6667   // original shuffle indices and move the shuffled lanes corresponding to true
6668   // mask to destination vector.
6669   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6670   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6671   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6672 
6673   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6674   // and broadcasting second 128 bit lane.
6675   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6676   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6677   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6678   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6679   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6680 
6681   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6682   // and broadcasting third 128 bit lane.
6683   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6684   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6685   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6686   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6687   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6688 
6689   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6691   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6692   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6693   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6694   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6695   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6696 }
6697 
6698 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6699                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6700   if (vlen_enc == AVX_128bit) {
6701     vpermilps(dst, src, shuffle, vlen_enc);
6702   } else if (bt == T_INT) {
6703     vpermd(dst, shuffle, src, vlen_enc);
6704   } else {
6705     assert(bt == T_FLOAT, "");
6706     vpermps(dst, shuffle, src, vlen_enc);
6707   }
6708 }
6709 
6710 #ifdef _LP64
6711 void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
  // Note: Don't clobber obj anywhere in this method!
6713 
  // The incoming address points to obj-start + klass_offset_in_bytes. We need to extract
6715   // obj-start, so that we can load from the object's mark-word instead. Usually the address
6716   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
6717   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
6718   // then passes that register as obj and 0 in disp. The following code extracts the base
6719   // and offset to load the mark-word.
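  // For illustration: with obj = obj-start and disp = klass_offset_in_bytes, offset below
  // evaluates to mark_offset_in_bytes; with obj = obj-start + klass_offset_in_bytes and
  // disp = 0, it evaluates to mark_offset_in_bytes - klass_offset_in_bytes, so the load
  // lands on the mark-word in either case.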
6720   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
6721   movq(dst, Address(obj, index, scale, offset));
6722   shrq(dst, markWord::klass_shift);
6723 }
6724 #endif