1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 
  40 #ifdef PRODUCT
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
  48 // C2 compiled method's prolog code.
  49 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  50 
  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes;
  // the frame allocation can be either 3 or 6 bytes. So if we don't do a
  // stack bang then we must use the 6-byte frame allocation even if
  // we have no frame. :-(
  57   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  58 
  59   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  60   // Remove word for return addr
  61   framesize -= wordSize;
  62   stack_bang_size -= wordSize;
  63 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  69   if (stack_bang_size > 0) {
  70     generate_stack_overflow_check(stack_bang_size);
  71 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  74     push(rbp);
  75     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  76     if (PreserveFramePointer) {
  77       mov(rbp, rsp);
  78     }
  79     // Remove word for ebp
  80     framesize -= wordSize;
  81 
  82     // Create frame
  83     if (framesize) {
  84       subptr(rsp, framesize);
  85     }
  86   } else {
  87     // Create frame (force generation of a 4 byte immediate value)
  88     subptr_imm32(rsp, framesize);
  89 
  90     // Save RBP register now.
  91     framesize -= wordSize;
  92     movptr(Address(rsp, framesize), rbp);
  93     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  94     if (PreserveFramePointer) {
  95       movptr(rbp, rsp);
  96       if (framesize > 0) {
  97         addptr(rbp, framesize);
  98       }
  99     }
 100   }
 101 
 102   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 103     framesize -= wordSize;
 104     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 105   }
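  // Rough sketch of the frame laid out above (both paths end up with the same
  // shape; addresses grow downwards):
  //   [ return address           ]  <- pushed by the caller
  //   [ saved rbp                ]
  //   [ 0xbadb100d cookie        ]  <- only if VerifyStackAtCalls
  //   [ ... rest of the frame ... ]
  //   <- rsp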
 106 
 107 #ifndef _LP64
 108   // If method sets FPU control word do it now
 109   if (fp_mode_24b) {
 110     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 111   }
 112   if (UseSSE >= 2 && VerifyFPU) {
 113     verify_FPU(0, "FPU stack must be clean on entry");
 114   }
 115 #endif
 116 
 117 #ifdef ASSERT
 118   if (VerifyStackAtCalls) {
 119     Label L;
 120     push(rax);
 121     mov(rax, rsp);
 122     andptr(rax, StackAlignmentInBytes-1);
 123     cmpptr(rax, StackAlignmentInBytes-wordSize);
 124     pop(rax);
 125     jcc(Assembler::equal, L);
 126     STOP("Stack is not properly aligned!");
 127     bind(L);
 128   }
 129 #endif
 130 
 131   if (!is_stub) {
 132     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 134     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 135       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 136       Label dummy_slow_path;
 137       Label dummy_continuation;
 138       Label* slow_path = &dummy_slow_path;
 139       Label* continuation = &dummy_continuation;
 140       if (!Compile::current()->output()->in_scratch_emit_size()) {
 141         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 142         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 143         Compile::current()->output()->add_stub(stub);
 144         slow_path = &stub->entry();
 145         continuation = &stub->continuation();
 146       }
 147       bs->nmethod_entry_barrier(this, slow_path, continuation);
 148     }
 149 #else
 150     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 151     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 152 #endif
 153   }
 154 }
 155 
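// Map a vector length in bytes to the corresponding AVX vector-length encoding.
// Sub-16-byte lengths (4 and 8 bytes) still use the 128-bit encoding.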
 156 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 157   switch (vlen_in_bytes) {
 158     case  4: // fall-through
 159     case  8: // fall-through
 160     case 16: return Assembler::AVX_128bit;
 161     case 32: return Assembler::AVX_256bit;
 162     case 64: return Assembler::AVX_512bit;
 163 
 164     default: {
 165       ShouldNotReachHere();
 166       return Assembler::AVX_NoVec;
 167     }
 168   }
 169 }
 170 
 171 #if INCLUDE_RTM_OPT
 172 
 173 // Update rtm_counters based on abort status
 174 // input: abort_status
 175 //        rtm_counters (RTMLockingCounters*)
 176 // flags are killed
 177 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 178 
 179   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 180   if (PrintPreciseRTMLockingStatistics) {
 181     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 182       Label check_abort;
 183       testl(abort_status, (1<<i));
 184       jccb(Assembler::equal, check_abort);
 185       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 186       bind(check_abort);
 187     }
 188   }
 189 }
 190 
 191 // Branch if (random & (count-1) != 0), count is 2^n
 192 // tmp, scr and flags are killed
 193 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 194   assert(tmp == rax, "");
 195   assert(scr == rdx, "");
 196   rdtsc(); // modifies EDX:EAX
 197   andptr(tmp, count-1);
 198   jccb(Assembler::notZero, brLabel);
 199 }
 200 
 201 // Perform abort ratio calculation, set no_rtm bit if high ratio
 202 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 203 // tmpReg, rtm_counters_Reg and flags are killed
 204 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 205                                                     Register rtm_counters_Reg,
 206                                                     RTMLockingCounters* rtm_counters,
 207                                                     Metadata* method_data) {
 208   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 209 
 210   if (RTMLockingCalculationDelay > 0) {
 211     // Delay calculation
 212     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 213     testptr(tmpReg, tmpReg);
 214     jccb(Assembler::equal, L_done);
 215   }
 216   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 217   //   Aborted transactions = abort_count * 100
 218   //   All transactions = total_count *  RTMTotalCountIncrRate
 219   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 220 
 221   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 222   cmpptr(tmpReg, RTMAbortThreshold);
 223   jccb(Assembler::below, L_check_always_rtm2);
 224   imulptr(tmpReg, tmpReg, 100);
 225 
 226   Register scrReg = rtm_counters_Reg;
 227   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 228   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 229   imulptr(scrReg, scrReg, RTMAbortRatio);
 230   cmpptr(tmpReg, scrReg);
 231   jccb(Assembler::below, L_check_always_rtm1);
 232   if (method_data != nullptr) {
 233     // set rtm_state to "no rtm" in MDO
 234     mov_metadata(tmpReg, method_data);
 235     lock();
 236     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 237   }
 238   jmpb(L_done);
 239   bind(L_check_always_rtm1);
 240   // Reload RTMLockingCounters* address
 241   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 242   bind(L_check_always_rtm2);
 243   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 244   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 245   jccb(Assembler::below, L_done);
 246   if (method_data != nullptr) {
 247     // set rtm_state to "always rtm" in MDO
 248     mov_metadata(tmpReg, method_data);
 249     lock();
 250     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 251   }
 252   bind(L_done);
 253 }
 254 
 255 // Update counters and perform abort ratio calculation
 256 // input:  abort_status_Reg
 257 // rtm_counters_Reg, flags are killed
 258 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 259                                       Register rtm_counters_Reg,
 260                                       RTMLockingCounters* rtm_counters,
 261                                       Metadata* method_data,
 262                                       bool profile_rtm) {
 263 
 264   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 265   // update rtm counters based on rax value at abort
 266   // reads abort_status_Reg, updates flags
 267   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 268   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 269   if (profile_rtm) {
 270     // Save abort status because abort_status_Reg is used by following code.
 271     if (RTMRetryCount > 0) {
 272       push(abort_status_Reg);
 273     }
 274     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 275     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 276     // restore abort status
 277     if (RTMRetryCount > 0) {
 278       pop(abort_status_Reg);
 279     }
 280   }
 281 }
 282 
 283 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 284 // inputs: retry_count_Reg
 285 //       : abort_status_Reg
 286 // output: retry_count_Reg decremented by 1
 287 // flags are killed
 288 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 289   Label doneRetry;
 290   assert(abort_status_Reg == rax, "");
 291   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 292   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 293   // if reason is in 0x6 and retry count != 0 then retry
 294   andptr(abort_status_Reg, 0x6);
 295   jccb(Assembler::zero, doneRetry);
 296   testl(retry_count_Reg, retry_count_Reg);
 297   jccb(Assembler::zero, doneRetry);
 298   pause();
 299   decrementl(retry_count_Reg);
 300   jmp(retryLabel);
 301   bind(doneRetry);
 302 }
 303 
 304 // Spin and retry if lock is busy,
 305 // inputs: box_Reg (monitor address)
 306 //       : retry_count_Reg
 307 // output: retry_count_Reg decremented by 1
 308 //       : clear z flag if retry count exceeded
 309 // tmp_Reg, scr_Reg, flags are killed
 310 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 311                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 312   Label SpinLoop, SpinExit, doneRetry;
 313   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 314 
 315   testl(retry_count_Reg, retry_count_Reg);
 316   jccb(Assembler::zero, doneRetry);
 317   decrementl(retry_count_Reg);
 318   movptr(scr_Reg, RTMSpinLoopCount);
 319 
 320   bind(SpinLoop);
 321   pause();
 322   decrementl(scr_Reg);
 323   jccb(Assembler::lessEqual, SpinExit);
 324   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 325   testptr(tmp_Reg, tmp_Reg);
 326   jccb(Assembler::notZero, SpinLoop);
 327 
 328   bind(SpinExit);
 329   jmp(retryLabel);
 330   bind(doneRetry);
 331   incrementl(retry_count_Reg); // clear z flag
 332 }
 333 
 334 // Use RTM for normal stack locks
 335 // Input: objReg (object to lock)
 336 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 337                                          Register retry_on_abort_count_Reg,
 338                                          RTMLockingCounters* stack_rtm_counters,
 339                                          Metadata* method_data, bool profile_rtm,
 340                                          Label& DONE_LABEL, Label& IsInflated) {
 341   assert(UseRTMForStackLocks, "why call this otherwise?");
 342   assert(tmpReg == rax, "");
 343   assert(scrReg == rdx, "");
 344   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 345 
 346   if (RTMRetryCount > 0) {
 347     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 348     bind(L_rtm_retry);
 349   }
 350   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 351   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 352   jcc(Assembler::notZero, IsInflated);
 353 
 354   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 355     Label L_noincrement;
 356     if (RTMTotalCountIncrRate > 1) {
 357       // tmpReg, scrReg and flags are killed
 358       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 359     }
 360     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 361     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 362     bind(L_noincrement);
 363   }
 364   xbegin(L_on_abort);
 365   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 366   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 367   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 368   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 369 
 370   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 371   if (UseRTMXendForLockBusy) {
 372     xend();
 373     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 374     jmp(L_decrement_retry);
 375   }
 376   else {
 377     xabort(0);
 378   }
 379   bind(L_on_abort);
 380   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 381     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 382   }
 383   bind(L_decrement_retry);
 384   if (RTMRetryCount > 0) {
 385     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 386     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 387   }
 388 }
 389 
 390 // Use RTM for inflating locks
 391 // inputs: objReg (object to lock)
 392 //         boxReg (on-stack box address (displaced header location) - KILLED)
 393 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 394 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 395                                             Register scrReg, Register retry_on_busy_count_Reg,
 396                                             Register retry_on_abort_count_Reg,
 397                                             RTMLockingCounters* rtm_counters,
 398                                             Metadata* method_data, bool profile_rtm,
 399                                             Label& DONE_LABEL) {
 400   assert(UseRTMLocking, "why call this otherwise?");
 401   assert(tmpReg == rax, "");
 402   assert(scrReg == rdx, "");
 403   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 404   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 405 
 406   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 407   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 408 
 409   if (RTMRetryCount > 0) {
 410     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 411     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 412     bind(L_rtm_retry);
 413   }
 414   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 415     Label L_noincrement;
 416     if (RTMTotalCountIncrRate > 1) {
 417       // tmpReg, scrReg and flags are killed
 418       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 419     }
 420     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 421     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 422     bind(L_noincrement);
 423   }
 424   xbegin(L_on_abort);
 425   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 426   movptr(tmpReg, Address(tmpReg, owner_offset));
 427   testptr(tmpReg, tmpReg);
 428   jcc(Assembler::zero, DONE_LABEL);
 429   if (UseRTMXendForLockBusy) {
 430     xend();
 431     jmp(L_decrement_retry);
 432   }
 433   else {
 434     xabort(0);
 435   }
 436   bind(L_on_abort);
 437   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 438   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 439     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 440   }
 441   if (RTMRetryCount > 0) {
 442     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 443     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 444   }
 445 
 446   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 447   testptr(tmpReg, tmpReg) ;
 448   jccb(Assembler::notZero, L_decrement_retry) ;
 449 
 450   // Appears unlocked - try to swing _owner from null to non-null.
 451   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 452 #ifdef _LP64
 453   Register threadReg = r15_thread;
 454 #else
 455   get_thread(scrReg);
 456   Register threadReg = scrReg;
 457 #endif
 458   lock();
 459   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 460 
 461   if (RTMRetryCount > 0) {
 462     // success done else retry
 463     jccb(Assembler::equal, DONE_LABEL) ;
 464     bind(L_decrement_retry);
 465     // Spin and retry if lock is busy.
 466     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 467   }
 468   else {
 469     bind(L_decrement_retry);
 470   }
 471 }
 472 
 473 #endif //  INCLUDE_RTM_OPT
 474 
 475 // fast_lock and fast_unlock used by C2
 476 
 477 // Because the transitions from emitted code to the runtime
 478 // monitorenter/exit helper stubs are so slow it's critical that
 479 // we inline both the stack-locking fast path and the inflated fast path.
 480 //
 481 // See also: cmpFastLock and cmpFastUnlock.
 482 //
 483 // What follows is a specialized inline transliteration of the code
 484 // in enter() and exit(). If we're concerned about I$ bloat another
 485 // option would be to emit TrySlowEnter and TrySlowExit methods
 486 // at startup-time.  These methods would accept arguments as
 487 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 488 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 489 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 490 // In practice, however, the # of lock sites is bounded and is usually small.
 491 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 495 //
 496 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 497 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 498 // to those specialized methods.  That'd give us a mostly platform-independent
 499 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 501 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 502 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 503 // (b) explicit barriers or fence operations.
 504 //
 505 // TODO:
 506 //
 507 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 508 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 509 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 510 //    the lock operators would typically be faster than reifying Self.
 511 //
 512 // *  Ideally I'd define the primitives as:
 513 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 514 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 515 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
 518 //    sub-optimal code near the synchronization site.
 519 //
 520 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 521 //    Alternately, use a better sp-proximity test.
 522 //
 523 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 524 //    Either one is sufficient to uniquely identify a thread.
 525 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 526 //
 527 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 530 //
 531 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 532 //    But beware of excessive branch density on AMD Opterons.
 533 //
 534 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 535 //    or failure of the fast path.  If the fast path fails then we pass
 536 //    control to the slow path, typically in C.  In fast_lock and
 537 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 538 //    will emit a conditional branch immediately after the node.
 539 //    So we have branches to branches and lots of ICC.ZF games.
 540 //    Instead, it might be better to have C2 pass a "FailureLabel"
 541 //    into fast_lock and fast_unlock.  In the case of success, control
 542 //    will drop through the node.  ICC.ZF is undefined at exit.
 543 //    In the case of failure, the node will branch directly to the
 544 //    FailureLabel
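//
// Illustrative shape of the code C2 ends up emitting around these nodes
// (pseudo-assembly sketch, not literal output):
//
//   fast_lock(obj, box, rax, ...)   // sets ICC.ZF
//   jne   slow_path_call            // ZF == 0 -> failure, take the slow path
//   ...                             // ZF == 1 -> lock acquired, fall through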
 545 
 546 
 547 // obj: object to lock
 548 // box: on-stack box address (displaced header location) - KILLED
 549 // rax,: tmp -- KILLED
 550 // scr: tmp -- KILLED
 551 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 552                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 553                                  RTMLockingCounters* rtm_counters,
 554                                  RTMLockingCounters* stack_rtm_counters,
 555                                  Metadata* method_data,
 556                                  bool use_rtm, bool profile_rtm) {
 557   // Ensure the register assignments are disjoint
 558   assert(tmpReg == rax, "");
 559 
 560   if (use_rtm) {
 561     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 562   } else {
 563     assert(cx1Reg == noreg, "");
 564     assert(cx2Reg == noreg, "");
 565     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 566   }
 567 
 568   // Possible cases that we'll encounter in fast_lock
 569   // ------------------------------------------------
 570   // * Inflated
 571   //    -- unlocked
 572   //    -- Locked
 573   //       = by self
 574   //       = by other
 575   // * neutral
 576   // * stack-locked
 577   //    -- by self
 578   //       = sp-proximity test hits
 579   //       = sp-proximity test generates false-negative
 580   //    -- by other
 581   //
 582 
 583   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 584 
 585   if (DiagnoseSyncOnValueBasedClasses != 0) {
 586     load_klass(tmpReg, objReg, scrReg);
 587     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 588     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 589     jcc(Assembler::notZero, DONE_LABEL);
 590   }
 591 
 592 #if INCLUDE_RTM_OPT
 593   if (UseRTMForStackLocks && use_rtm) {
 594     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 595     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 596                       stack_rtm_counters, method_data, profile_rtm,
 597                       DONE_LABEL, IsInflated);
 598   }
 599 #endif // INCLUDE_RTM_OPT
 600 
 601   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 602   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 603   jcc(Assembler::notZero, IsInflated);
 604 
 605   if (LockingMode == LM_MONITOR) {
 606     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 607     testptr(objReg, objReg);
 608   } else if (LockingMode == LM_LEGACY) {
 609     // Attempt stack-locking ...
 610     orptr (tmpReg, markWord::unlocked_value);
 611     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 612     lock();
 613     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 614     jcc(Assembler::equal, COUNT);           // Success
 615 
 616     // Recursive locking.
 617     // The object is stack-locked: markword contains stack pointer to BasicLock.
 618     // Locked by current thread if difference with current SP is less than one page.
 619     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 621     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 622     movptr(Address(boxReg, 0), tmpReg);
 623   } else {
 624     assert(LockingMode == LM_LIGHTWEIGHT, "");
 625     lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
 626     jmp(COUNT);
 627   }
 628   jmp(DONE_LABEL);
 629 
 630   bind(IsInflated);
 631   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 632 
 633 #if INCLUDE_RTM_OPT
 634   // Use the same RTM locking code in 32- and 64-bit VM.
 635   if (use_rtm) {
 636     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 637                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 638   } else {
 639 #endif // INCLUDE_RTM_OPT
 640 
 641 #ifndef _LP64
 642   // The object is inflated.
 643 
 644   // boxReg refers to the on-stack BasicLock in the current frame.
 645   // We'd like to write:
 646   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 648   // additional latency as we have another ST in the store buffer that must drain.
 649 
 650   // avoid ST-before-CAS
 651   // register juggle because we need tmpReg for cmpxchgptr below
 652   movptr(scrReg, boxReg);
 653   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 654 
 655   // Optimistic form: consider XORL tmpReg,tmpReg
 656   movptr(tmpReg, NULL_WORD);
 657 
 658   // Appears unlocked - try to swing _owner from null to non-null.
 659   // Ideally, I'd manifest "Self" with get_thread and then attempt
 660   // to CAS the register containing Self into m->Owner.
 661   // But we don't have enough registers, so instead we can either try to CAS
 662   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 663   // we later store "Self" into m->Owner.  Transiently storing a stack address
 664   // (rsp or the address of the box) into  m->owner is harmless.
 665   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 666   lock();
 667   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 668   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 669   // If we weren't able to swing _owner from null to the BasicLock
 670   // then take the slow path.
 671   jccb  (Assembler::notZero, NO_COUNT);
 672   // update _owner from BasicLock to thread
 673   get_thread (scrReg);                    // beware: clobbers ICCs
 674   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 675   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 676 
 677   // If the CAS fails we can either retry or pass control to the slow path.
 678   // We use the latter tactic.
 679   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 680   // If the CAS was successful ...
 681   //   Self has acquired the lock
 682   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 683   // Intentional fall-through into DONE_LABEL ...
 684 #else // _LP64
 685   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 686   movq(scrReg, tmpReg);
 687   xorq(tmpReg, tmpReg);
 688   lock();
 689   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 690   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 691   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 692   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 693   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 694   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 695 
 696   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 697   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 698   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 699   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 700 #endif // _LP64
 701 #if INCLUDE_RTM_OPT
 702   } // use_rtm()
 703 #endif
 704   bind(DONE_LABEL);
 705 
 706   // ZFlag == 1 count in fast path
 707   // ZFlag == 0 count in slow path
 708   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 709 
 710   bind(COUNT);
 711   // Count monitors in fast path
 712   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 713 
 714   xorl(tmpReg, tmpReg); // Set ZF == 1
 715 
 716   bind(NO_COUNT);
 717 
 718   // At NO_COUNT the icc ZFlag is set as follows ...
 719   // fast_unlock uses the same protocol.
 720   // ZFlag == 1 -> Success
 721   // ZFlag == 0 -> Failure - force control through the slow path
 722 }
 723 
 724 // obj: object to unlock
 725 // box: box address (displaced header location), killed.  Must be EAX.
 726 // tmp: killed, cannot be obj nor box.
 727 //
 728 // Some commentary on balanced locking:
 729 //
 730 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 731 // Methods that don't have provably balanced locking are forced to run in the
 732 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 733 // The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 740 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 742 // B() doesn't have provably balanced locking so it runs in the interpreter.
 743 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 744 // is still locked by A().
 745 //
 746 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 747 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 748 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 749 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
 752 // In the interest of performance we elide m->Owner==Self check in unlock.
 753 // A perfectly viable alternative is to elide the owner check except when
 754 // Xcheck:jni is enabled.
 755 
 756 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 757   assert(boxReg == rax, "");
 758   assert_different_registers(objReg, boxReg, tmpReg);
 759 
 760   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 761 
 762 #if INCLUDE_RTM_OPT
 763   if (UseRTMForStackLocks && use_rtm) {
 764     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 765     Label L_regular_unlock;
 766     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 767     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 768     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 769     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 770     xend();                                                           // otherwise end...
 771     jmp(DONE_LABEL);                                                  // ... and we're done
 772     bind(L_regular_unlock);
 773   }
 774 #endif
 775 
 776   if (LockingMode == LM_LEGACY) {
 777     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 778     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 779   }
 780   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 781   if (LockingMode != LM_MONITOR) {
 782     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 783     jcc(Assembler::zero, Stacked);
 784   }
 785 
 786   // It's inflated.
 787   if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an out-of-line stub.
 789     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 790 #ifdef _LP64
 791     if (!Compile::current()->output()->in_scratch_emit_size()) {
 792       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 793       Compile::current()->output()->add_stub(stub);
 794       jcc(Assembler::notEqual, stub->entry());
 795       bind(stub->continuation());
 796     } else
 797 #endif
 798     {
 799       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 800       // Call the slow-path instead.
 801       jcc(Assembler::notEqual, NO_COUNT);
 802     }
 803   }
 804 
 805 #if INCLUDE_RTM_OPT
 806   if (use_rtm) {
 807     Label L_regular_inflated_unlock;
 808     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 809     movptr(boxReg, Address(tmpReg, owner_offset));
 810     testptr(boxReg, boxReg);
 811     jccb(Assembler::notZero, L_regular_inflated_unlock);
 812     xend();
 813     jmp(DONE_LABEL);
 814     bind(L_regular_inflated_unlock);
 815   }
 816 #endif
 817 
 818   // Despite our balanced locking property we still check that m->_owner == Self
 819   // as java routines or native JNI code called by this thread might
 820   // have released the lock.
 821   // Refer to the comments in synchronizer.cpp for how we might encode extra
 822   // state in _succ so we can avoid fetching EntryList|cxq.
 823   //
 824   // If there's no contention try a 1-0 exit.  That is, exit without
 825   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 826   // we detect and recover from the race that the 1-0 exit admits.
 827   //
 828   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 829   // before it STs null into _owner, releasing the lock.  Updates
 830   // to data protected by the critical section must be visible before
 831   // we drop the lock (and thus before any other thread could acquire
 832   // the lock and observe the fields protected by the lock).
 833   // IA32's memory-model is SPO, so STs are ordered with respect to
 834   // each other and there's no need for an explicit barrier (fence).
 835   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 836 #ifndef _LP64
 837   // Note that we could employ various encoding schemes to reduce
 838   // the number of loads below (currently 4) to just 2 or 3.
 839   // Refer to the comments in synchronizer.cpp.
 840   // In practice the chain of fetches doesn't seem to impact performance, however.
 841   xorptr(boxReg, boxReg);
 842   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 843   jccb  (Assembler::notZero, DONE_LABEL);
 844   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 845   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 846   jccb  (Assembler::notZero, DONE_LABEL);
 847   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 848   jmpb  (DONE_LABEL);
 849 #else // _LP64
 850   // It's inflated
 851   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 852 
 853   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 854   jccb(Assembler::equal, LNotRecursive);
 855 
 856   // Recursive inflated unlock
 857   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 858   jmpb(LSuccess);
 859 
 860   bind(LNotRecursive);
 861   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 862   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 863   jccb  (Assembler::notZero, CheckSucc);
 864   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 865   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 866   jmpb  (DONE_LABEL);
 867 
 868   // Try to avoid passing control into the slow_path ...
 869   bind  (CheckSucc);
 870 
 871   // The following optional optimization can be elided if necessary
 872   // Effectively: if (succ == null) goto slow path
 873   // The code reduces the window for a race, however,
 874   // and thus benefits performance.
 875   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 876   jccb  (Assembler::zero, LGoSlowPath);
 877 
 878   xorptr(boxReg, boxReg);
 879   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 880   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 881 
 882   // Memory barrier/fence
 883   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 884   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 885   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 886   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 887   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 888   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 889   lock(); addl(Address(rsp, 0), 0);
 890 
 891   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 892   jccb  (Assembler::notZero, LSuccess);
 893 
 894   // Rare inopportune interleaving - race.
 895   // The successor vanished in the small window above.
 896   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 897   // We need to ensure progress and succession.
 898   // Try to reacquire the lock.
 899   // If that fails then the new owner is responsible for succession and this
 900   // thread needs to take no further action and can exit via the fast path (success).
 901   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 905 
 906   // box is really RAX -- the following CMPXCHG depends on that binding
 907   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 908   lock();
 909   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 910   // There's no successor so we tried to regrab the lock.
 911   // If that didn't work, then another thread grabbed the
 912   // lock so we're done (and exit was a success).
 913   jccb  (Assembler::notEqual, LSuccess);
 914   // Intentional fall-through into slow path
 915 
 916   bind  (LGoSlowPath);
 917   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 918   jmpb  (DONE_LABEL);
 919 
 920   bind  (LSuccess);
 921   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 922   jmpb  (DONE_LABEL);
 923 
 924 #endif
 925   if (LockingMode != LM_MONITOR) {
 926     bind  (Stacked);
 927     if (LockingMode == LM_LIGHTWEIGHT) {
 928       mov(boxReg, tmpReg);
 929       lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT);
 930       jmp(COUNT);
 931     } else if (LockingMode == LM_LEGACY) {
 932       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 933       lock();
 934       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 935     }
 936     // Intentional fall-thru into DONE_LABEL
 937   }
 938   bind(DONE_LABEL);
 939 
 940   // ZFlag == 1 count in fast path
 941   // ZFlag == 0 count in slow path
 942   jccb(Assembler::notZero, NO_COUNT);
 943 
 944   bind(COUNT);
 945   // Count monitors in fast path
 946 #ifndef _LP64
 947   get_thread(tmpReg);
 948   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 949 #else // _LP64
 950   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 951 #endif
 952 
 953   xorl(tmpReg, tmpReg); // Set ZF == 1
 954 
 955   bind(NO_COUNT);
 956 }
 957 
 958 //-------------------------------------------------------------------------------------------
 959 // Generic instructions support for use in .ad files C2 code generation
 960 
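// Scalar/vector abs and neg for doubles: abs clears the sign bit by AND-ing with a
// sign mask, neg flips the sign bit by XOR-ing with a sign-flip constant.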
 961 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 962   if (dst != src) {
 963     movdqu(dst, src);
 964   }
 965   if (opcode == Op_AbsVD) {
 966     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 967   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 969     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 970   }
 971 }
 972 
 973 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 974   if (opcode == Op_AbsVD) {
 975     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 976   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 978     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 979   }
 980 }
 981 
 982 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 983   if (dst != src) {
 984     movdqu(dst, src);
 985   }
 986   if (opcode == Op_AbsVF) {
 987     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 988   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 990     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 991   }
 992 }
 993 
 994 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 995   if (opcode == Op_AbsVF) {
 996     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 997   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 999     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1000   }
1001 }
1002 
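// Packed signed min/max using the legacy SSE forms. Byte/short/int map directly to
// pmins*/pmaxs*; SSE has no packed 64-bit min/max, so the T_LONG case is synthesized
// with pcmpgtq + blendvpd, which implicitly uses xmm0 as the blend mask.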
1003 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1004   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1005   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1006 
1007   if (opcode == Op_MinV) {
1008     if (elem_bt == T_BYTE) {
1009       pminsb(dst, src);
1010     } else if (elem_bt == T_SHORT) {
1011       pminsw(dst, src);
1012     } else if (elem_bt == T_INT) {
1013       pminsd(dst, src);
1014     } else {
1015       assert(elem_bt == T_LONG, "required");
1016       assert(tmp == xmm0, "required");
1017       assert_different_registers(dst, src, tmp);
1018       movdqu(xmm0, dst);
1019       pcmpgtq(xmm0, src);
1020       blendvpd(dst, src);  // xmm0 as mask
1021     }
1022   } else { // opcode == Op_MaxV
1023     if (elem_bt == T_BYTE) {
1024       pmaxsb(dst, src);
1025     } else if (elem_bt == T_SHORT) {
1026       pmaxsw(dst, src);
1027     } else if (elem_bt == T_INT) {
1028       pmaxsd(dst, src);
1029     } else {
1030       assert(elem_bt == T_LONG, "required");
1031       assert(tmp == xmm0, "required");
1032       assert_different_registers(dst, src, tmp);
1033       movdqu(xmm0, src);
1034       pcmpgtq(xmm0, dst);
1035       blendvpd(dst, src);  // xmm0 as mask
1036     }
1037   }
1038 }
1039 
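// AVX forms of packed signed min/max. The T_LONG case uses vpminsq/vpmaxsq when the
// vector is 512 bits wide or AVX-512VL is available, and otherwise falls back to a
// compare-and-blend sequence.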
1040 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1041                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1042                                  int vlen_enc) {
1043   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1044 
1045   if (opcode == Op_MinV) {
1046     if (elem_bt == T_BYTE) {
1047       vpminsb(dst, src1, src2, vlen_enc);
1048     } else if (elem_bt == T_SHORT) {
1049       vpminsw(dst, src1, src2, vlen_enc);
1050     } else if (elem_bt == T_INT) {
1051       vpminsd(dst, src1, src2, vlen_enc);
1052     } else {
1053       assert(elem_bt == T_LONG, "required");
1054       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1055         vpminsq(dst, src1, src2, vlen_enc);
1056       } else {
1057         assert_different_registers(dst, src1, src2);
1058         vpcmpgtq(dst, src1, src2, vlen_enc);
1059         vblendvpd(dst, src1, src2, dst, vlen_enc);
1060       }
1061     }
1062   } else { // opcode == Op_MaxV
1063     if (elem_bt == T_BYTE) {
1064       vpmaxsb(dst, src1, src2, vlen_enc);
1065     } else if (elem_bt == T_SHORT) {
1066       vpmaxsw(dst, src1, src2, vlen_enc);
1067     } else if (elem_bt == T_INT) {
1068       vpmaxsd(dst, src1, src2, vlen_enc);
1069     } else {
1070       assert(elem_bt == T_LONG, "required");
1071       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1072         vpmaxsq(dst, src1, src2, vlen_enc);
1073       } else {
1074         assert_different_registers(dst, src1, src2);
1075         vpcmpgtq(dst, src1, src2, vlen_enc);
1076         vblendvpd(dst, src2, src1, dst, vlen_enc);
1077       }
1078     }
1079   }
1080 }
1081 
1082 // Float/Double min max
1083 
1084 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1085                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1086                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1087                                    int vlen_enc) {
1088   assert(UseAVX > 0, "required");
1089   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1090          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1091   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1092   assert_different_registers(a, tmp, atmp, btmp);
1093   assert_different_registers(b, tmp, atmp, btmp);
1094 
1095   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1096   bool is_double_word = is_double_word_type(elem_bt);
1097 
1098   /* Note on 'non-obvious' assembly sequence:
1099    *
1100    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1101    * and Java on how they handle floats:
1102    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1104    *
1105    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1106    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1107    *                (only useful when signs differ, noop otherwise)
1108    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
1111    *   btmp = (b < +0.0) ? a : b
1112    *   atmp = (b < +0.0) ? b : a
1113    *   Tmp  = Max_Float(atmp , btmp)
1114    *   Res  = (atmp == NaN) ? atmp : Tmp
1115    */
1116 
1117   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1118   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1119   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1120   XMMRegister mask;
1121 
1122   if (!is_double_word && is_min) {
1123     mask = a;
1124     vblend = &MacroAssembler::vblendvps;
1125     vmaxmin = &MacroAssembler::vminps;
1126     vcmp = &MacroAssembler::vcmpps;
1127   } else if (!is_double_word && !is_min) {
1128     mask = b;
1129     vblend = &MacroAssembler::vblendvps;
1130     vmaxmin = &MacroAssembler::vmaxps;
1131     vcmp = &MacroAssembler::vcmpps;
1132   } else if (is_double_word && is_min) {
1133     mask = a;
1134     vblend = &MacroAssembler::vblendvpd;
1135     vmaxmin = &MacroAssembler::vminpd;
1136     vcmp = &MacroAssembler::vcmppd;
1137   } else {
1138     assert(is_double_word && !is_min, "sanity");
1139     mask = b;
1140     vblend = &MacroAssembler::vblendvpd;
1141     vmaxmin = &MacroAssembler::vmaxpd;
1142     vcmp = &MacroAssembler::vcmppd;
1143   }
1144 
1145   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1146   XMMRegister maxmin, scratch;
1147   if (dst == btmp) {
1148     maxmin = btmp;
1149     scratch = tmp;
1150   } else {
1151     maxmin = tmp;
1152     scratch = btmp;
1153   }
1154 
1155   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1156   if (precompute_mask && !is_double_word) {
1157     vpsrad(tmp, mask, 32, vlen_enc);
1158     mask = tmp;
1159   } else if (precompute_mask && is_double_word) {
1160     vpxor(tmp, tmp, tmp, vlen_enc);
1161     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1162     mask = tmp;
1163   }
1164 
1165   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1166   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1167   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1168   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1169   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1170 }
1171 
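// AVX-512 variant of the float/double min/max above, using an opmask register:
// the sign of one input (evpmovd2m/evpmovq2m) selects the operand ordering for the
// blends, vmin*/vmax* does the arithmetic, and an unordered compare merges any NaN
// lanes back into the result.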
1172 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1173                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1174                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1175                                     int vlen_enc) {
1176   assert(UseAVX > 2, "required");
1177   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1178          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1179   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1180   assert_different_registers(dst, a, atmp, btmp);
1181   assert_different_registers(dst, b, atmp, btmp);
1182 
1183   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1184   bool is_double_word = is_double_word_type(elem_bt);
1185   bool merge = true;
1186 
1187   if (!is_double_word && is_min) {
1188     evpmovd2m(ktmp, a, vlen_enc);
1189     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1190     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1191     vminps(dst, atmp, btmp, vlen_enc);
1192     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1193     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1194   } else if (!is_double_word && !is_min) {
1195     evpmovd2m(ktmp, b, vlen_enc);
1196     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1197     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1198     vmaxps(dst, atmp, btmp, vlen_enc);
1199     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1200     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1201   } else if (is_double_word && is_min) {
1202     evpmovq2m(ktmp, a, vlen_enc);
1203     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1204     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1205     vminpd(dst, atmp, btmp, vlen_enc);
1206     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1207     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1208   } else {
1209     assert(is_double_word && !is_min, "sanity");
1210     evpmovq2m(ktmp, b, vlen_enc);
1211     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1212     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1213     vmaxpd(dst, atmp, btmp, vlen_enc);
1214     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1215     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1216   }
1217 }
1218 
1219 // Float/Double signum
1220 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1221   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1222 
1223   Label DONE_LABEL;
1224 
1225   if (opcode == Op_SignumF) {
1226     assert(UseSSE > 0, "required");
1227     ucomiss(dst, zero);
1228     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1229     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1230     movflt(dst, one);
1231     jcc(Assembler::above, DONE_LABEL);
1232     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1233   } else if (opcode == Op_SignumD) {
1234     assert(UseSSE > 1, "required");
1235     ucomisd(dst, zero);
1236     jcc(Assembler::equal, DONE_LABEL);    // special case +0.0/-0.0: return the argument unchanged
1237     jcc(Assembler::parity, DONE_LABEL);   // special case NaN: return the argument (NaN) unchanged
1238     movdbl(dst, one);
1239     jcc(Assembler::above, DONE_LABEL);
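         // Argument is negative: flip the sign bit of 1.0 to produce -1.0.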
1240     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1241   }
1242 
1243   bind(DONE_LABEL);
1244 }
1245 
1246 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1247   if (sign) {
1248     pmovsxbw(dst, src);
1249   } else {
1250     pmovzxbw(dst, src);
1251   }
1252 }
1253 
1254 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1255   if (sign) {
1256     vpmovsxbw(dst, src, vector_len);
1257   } else {
1258     vpmovzxbw(dst, src, vector_len);
1259   }
1260 }
1261 
1262 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1263   if (sign) {
1264     vpmovsxbd(dst, src, vector_len);
1265   } else {
1266     vpmovzxbd(dst, src, vector_len);
1267   }
1268 }
1269 
1270 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1271   if (sign) {
1272     vpmovsxwd(dst, src, vector_len);
1273   } else {
1274     vpmovzxwd(dst, src, vector_len);
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1279                                      int shift, int vector_len) {
1280   if (opcode == Op_RotateLeftV) {
1281     if (etype == T_INT) {
1282       evprold(dst, src, shift, vector_len);
1283     } else {
1284       assert(etype == T_LONG, "expected type T_LONG");
1285       evprolq(dst, src, shift, vector_len);
1286     }
1287   } else {
1288     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1289     if (etype == T_INT) {
1290       evprord(dst, src, shift, vector_len);
1291     } else {
1292       assert(etype == T_LONG, "expected type T_LONG");
1293       evprorq(dst, src, shift, vector_len);
1294     }
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1299                                      XMMRegister shift, int vector_len) {
1300   if (opcode == Op_RotateLeftV) {
1301     if (etype == T_INT) {
1302       evprolvd(dst, src, shift, vector_len);
1303     } else {
1304       assert(etype == T_LONG, "expected type T_LONG");
1305       evprolvq(dst, src, shift, vector_len);
1306     }
1307   } else {
1308     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1309     if (etype == T_INT) {
1310       evprorvd(dst, src, shift, vector_len);
1311     } else {
1312       assert(etype == T_LONG, "expected type T_LONG");
1313       evprorvq(dst, src, shift, vector_len);
1314     }
1315   }
1316 }
1317 
1318 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1319   if (opcode == Op_RShiftVI) {
1320     psrad(dst, shift);
1321   } else if (opcode == Op_LShiftVI) {
1322     pslld(dst, shift);
1323   } else {
1324     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1325     psrld(dst, shift);
1326   }
1327 }
1328 
1329 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1330   switch (opcode) {
1331     case Op_RShiftVI:  psrad(dst, shift); break;
1332     case Op_LShiftVI:  pslld(dst, shift); break;
1333     case Op_URShiftVI: psrld(dst, shift); break;
1334 
1335     default: assert(false, "%s", NodeClassNames[opcode]);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1340   if (opcode == Op_RShiftVI) {
1341     vpsrad(dst, nds, shift, vector_len);
1342   } else if (opcode == Op_LShiftVI) {
1343     vpslld(dst, nds, shift, vector_len);
1344   } else {
1345     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1346     vpsrld(dst, nds, shift, vector_len);
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1351   switch (opcode) {
1352     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1353     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1354     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1355 
1356     default: assert(false, "%s", NodeClassNames[opcode]);
1357   }
1358 }
1359 
1360 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1361   switch (opcode) {
1362     case Op_RShiftVB:  // fall-through
1363     case Op_RShiftVS:  psraw(dst, shift); break;
1364 
1365     case Op_LShiftVB:  // fall-through
1366     case Op_LShiftVS:  psllw(dst, shift);   break;
1367 
1368     case Op_URShiftVS: // fall-through
1369     case Op_URShiftVB: psrlw(dst, shift);  break;
1370 
1371     default: assert(false, "%s", NodeClassNames[opcode]);
1372   }
1373 }
1374 
1375 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1376   switch (opcode) {
1377     case Op_RShiftVB:  // fall-through
1378     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1379 
1380     case Op_LShiftVB:  // fall-through
1381     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1382 
1383     case Op_URShiftVS: // fall-through
1384     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1385 
1386     default: assert(false, "%s", NodeClassNames[opcode]);
1387   }
1388 }
1389 
1390 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1391   switch (opcode) {
1392     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1393     case Op_LShiftVL:  psllq(dst, shift); break;
1394     case Op_URShiftVL: psrlq(dst, shift); break;
1395 
1396     default: assert(false, "%s", NodeClassNames[opcode]);
1397   }
1398 }
1399 
1400 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1401   if (opcode == Op_RShiftVL) {
1402     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1403   } else if (opcode == Op_LShiftVL) {
1404     psllq(dst, shift);
1405   } else {
1406     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1407     psrlq(dst, shift);
1408   }
1409 }
1410 
1411 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1412   switch (opcode) {
1413     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1414     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1415     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1416 
1417     default: assert(false, "%s", NodeClassNames[opcode]);
1418   }
1419 }
1420 
1421 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1422   if (opcode == Op_RShiftVL) {
1423     evpsraq(dst, nds, shift, vector_len);
1424   } else if (opcode == Op_LShiftVL) {
1425     vpsllq(dst, nds, shift, vector_len);
1426   } else {
1427     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1428     vpsrlq(dst, nds, shift, vector_len);
1429   }
1430 }
1431 
1432 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1433   switch (opcode) {
1434     case Op_RShiftVB:  // fall-through
1435     case Op_RShiftVS:  // fall-through
1436     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1437 
1438     case Op_LShiftVB:  // fall-through
1439     case Op_LShiftVS:  // fall-through
1440     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1441 
1442     case Op_URShiftVB: // fall-through
1443     case Op_URShiftVS: // fall-through
1444     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1445 
1446     default: assert(false, "%s", NodeClassNames[opcode]);
1447   }
1448 }
1449 
1450 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1451   switch (opcode) {
1452     case Op_RShiftVB:  // fall-through
1453     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1454 
1455     case Op_LShiftVB:  // fall-through
1456     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1457 
1458     case Op_URShiftVB: // fall-through
1459     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1460 
1461     default: assert(false, "%s", NodeClassNames[opcode]);
1462   }
1463 }
1464 
1465 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1466   assert(UseAVX >= 2, "required");
1467   switch (opcode) {
1468     case Op_RShiftVL: {
1469       if (UseAVX > 2) {
1470         assert(tmp == xnoreg, "not used");
1471         if (!VM_Version::supports_avx512vl()) {
1472           vlen_enc = Assembler::AVX_512bit;
1473         }
1474         evpsravq(dst, src, shift, vlen_enc);
1475       } else {
1476         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1477         vpsrlvq(dst, src, shift, vlen_enc);
1478         vpsrlvq(tmp, tmp, shift, vlen_enc);
1479         vpxor(dst, dst, tmp, vlen_enc);
1480         vpsubq(dst, dst, tmp, vlen_enc);
1481       }
1482       break;
1483     }
1484     case Op_LShiftVL: {
1485       assert(tmp == xnoreg, "not used");
1486       vpsllvq(dst, src, shift, vlen_enc);
1487       break;
1488     }
1489     case Op_URShiftVL: {
1490       assert(tmp == xnoreg, "not used");
1491       vpsrlvq(dst, src, shift, vlen_enc);
1492       break;
1493     }
1494     default: assert(false, "%s", NodeClassNames[opcode]);
1495   }
1496 }
1497 
1498 // Variable shift of src by shift, using vtmp as a TEMP, giving a word result in dst
1499 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1500   assert(opcode == Op_LShiftVB ||
1501          opcode == Op_RShiftVB ||
1502          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1503   bool sign = (opcode != Op_URShiftVB);
1504   assert(vector_len == 0, "required");
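       // Widen bytes to ints, do the variable shift on ints, mask back to the byte
       // range, then narrow the result to words.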
1505   vextendbd(sign, dst, src, 1);
1506   vpmovzxbd(vtmp, shift, 1);
1507   varshiftd(opcode, dst, dst, vtmp, 1);
1508   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1509   vextracti128_high(vtmp, dst);
1510   vpackusdw(dst, dst, vtmp, 0);
1511 }
1512 
1513 // Variable shift of src by shift, using vtmp as a TEMP, giving a byte result in dst
1514 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1515   assert(opcode == Op_LShiftVB ||
1516          opcode == Op_RShiftVB ||
1517          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1518   bool sign = (opcode != Op_URShiftVB);
1519   int ext_vector_len = vector_len + 1;
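       // Widen bytes to words, do the variable shift on words, mask back to the byte
       // range, then narrow to bytes (for 256/512-bit vectors vpermq fixes the lane
       // order after the in-lane pack).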
1520   vextendbw(sign, dst, src, ext_vector_len);
1521   vpmovzxbw(vtmp, shift, ext_vector_len);
1522   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1523   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1524   if (vector_len == 0) {
1525     vextracti128_high(vtmp, dst);
1526     vpackuswb(dst, dst, vtmp, vector_len);
1527   } else {
1528     vextracti64x4_high(vtmp, dst);
1529     vpackuswb(dst, dst, vtmp, vector_len);
1530     vpermq(dst, dst, 0xD8, vector_len);
1531   }
1532 }
1533 
1534 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1535   switch(typ) {
1536     case T_BYTE:
1537       pinsrb(dst, val, idx);
1538       break;
1539     case T_SHORT:
1540       pinsrw(dst, val, idx);
1541       break;
1542     case T_INT:
1543       pinsrd(dst, val, idx);
1544       break;
1545     case T_LONG:
1546       pinsrq(dst, val, idx);
1547       break;
1548     default:
1549       assert(false,"Should not reach here.");
1550       break;
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1555   switch(typ) {
1556     case T_BYTE:
1557       vpinsrb(dst, src, val, idx);
1558       break;
1559     case T_SHORT:
1560       vpinsrw(dst, src, val, idx);
1561       break;
1562     case T_INT:
1563       vpinsrd(dst, src, val, idx);
1564       break;
1565     case T_LONG:
1566       vpinsrq(dst, src, val, idx);
1567       break;
1568     default:
1569       assert(false,"Should not reach here.");
1570       break;
1571   }
1572 }
1573 
1574 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1575   switch(typ) {
1576     case T_INT:
1577       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1578       break;
1579     case T_FLOAT:
1580       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1581       break;
1582     case T_LONG:
1583       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1584       break;
1585     case T_DOUBLE:
1586       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1587       break;
1588     default:
1589       assert(false,"Should not reach here.");
1590       break;
1591   }
1592 }
1593 
1594 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1595   switch(typ) {
1596     case T_INT:
1597       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1598       break;
1599     case T_FLOAT:
1600       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1601       break;
1602     case T_LONG:
1603       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1604       break;
1605     case T_DOUBLE:
1606       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1607       break;
1608     default:
1609       assert(false,"Should not reach here.");
1610       break;
1611   }
1612 }
1613 
1614 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1615   switch(typ) {
1616     case T_INT:
1617       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1618       break;
1619     case T_FLOAT:
1620       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1621       break;
1622     case T_LONG:
1623       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1624       break;
1625     case T_DOUBLE:
1626       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1627       break;
1628     default:
1629       assert(false,"Should not reach here.");
1630       break;
1631   }
1632 }
1633 
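     // Expand a vector of byte booleans (assumed to be 0/1 per element) into a
     // full-width element mask (all-zeros/all-ones) by negating the bytes and
     // sign-extending to the element size.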
1634 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1635   if (vlen_in_bytes <= 16) {
1636     pxor (dst, dst);
1637     psubb(dst, src);
1638     switch (elem_bt) {
1639       case T_BYTE:   /* nothing to do */ break;
1640       case T_SHORT:  pmovsxbw(dst, dst); break;
1641       case T_INT:    pmovsxbd(dst, dst); break;
1642       case T_FLOAT:  pmovsxbd(dst, dst); break;
1643       case T_LONG:   pmovsxbq(dst, dst); break;
1644       case T_DOUBLE: pmovsxbq(dst, dst); break;
1645 
1646       default: assert(false, "%s", type2name(elem_bt));
1647     }
1648   } else {
1649     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1650     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1651 
1652     vpxor (dst, dst, dst, vlen_enc);
1653     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1654 
1655     switch (elem_bt) {
1656       case T_BYTE:   /* nothing to do */            break;
1657       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1658       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1659       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1660       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1661       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1662 
1663       default: assert(false, "%s", type2name(elem_bt));
1664     }
1665   }
1666 }
1667 
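     // Same idea, but producing an AVX-512 opmask. On the novlbwdq path the bytes are
     // first widened to ints and compared against a stub constant instead of using
     // evpmovb2m directly.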
1668 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1669   if (novlbwdq) {
1670     vpmovsxbd(xtmp, src, vlen_enc);
1671     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1672             Assembler::eq, true, vlen_enc, noreg);
1673   } else {
1674     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1675     vpsubb(xtmp, xtmp, src, vlen_enc);
1676     evpmovb2m(dst, xtmp, vlen_enc);
1677   }
1678 }
1679 
1680 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1681   switch (vlen_in_bytes) {
1682     case 4:  movdl(dst, src);   break;
1683     case 8:  movq(dst, src);    break;
1684     case 16: movdqu(dst, src);  break;
1685     case 32: vmovdqu(dst, src); break;
1686     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1687     default: ShouldNotReachHere();
1688   }
1689 }
1690 
1691 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1692   assert(rscratch != noreg || always_reachable(src), "missing");
1693 
1694   if (reachable(src)) {
1695     load_vector(dst, as_Address(src), vlen_in_bytes);
1696   } else {
1697     lea(rscratch, src);
1698     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1699   }
1700 }
1701 
1702 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1703   int vlen_enc = vector_length_encoding(vlen);
1704   if (VM_Version::supports_avx()) {
1705     if (bt == T_LONG) {
1706       if (VM_Version::supports_avx2()) {
1707         vpbroadcastq(dst, src, vlen_enc);
1708       } else {
1709         vmovddup(dst, src, vlen_enc);
1710       }
1711     } else if (bt == T_DOUBLE) {
1712       if (vlen_enc != Assembler::AVX_128bit) {
1713         vbroadcastsd(dst, src, vlen_enc, noreg);
1714       } else {
1715         vmovddup(dst, src, vlen_enc);
1716       }
1717     } else {
1718       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1719         vpbroadcastd(dst, src, vlen_enc);
1720       } else {
1721         vbroadcastss(dst, src, vlen_enc);
1722       }
1723     }
1724   } else if (VM_Version::supports_sse3()) {
1725     movddup(dst, src);
1726   } else {
1727     movq(dst, src);
1728     if (vlen == 16) {
1729       punpcklqdq(dst, dst);
1730     }
1731   }
1732 }
1733 
1734 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1735   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1736   int offset = exact_log2(type2aelembytes(bt)) << 6;
1737   if (is_floating_point_type(bt)) {
1738     offset += 128;
1739   }
1740   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1741   load_vector(dst, addr, vlen_in_bytes);
1742 }
1743 
1744 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1745 
1746 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1747   int vector_len = Assembler::AVX_128bit;
1748 
1749   switch (opcode) {
1750     case Op_AndReductionV:  pand(dst, src); break;
1751     case Op_OrReductionV:   por (dst, src); break;
1752     case Op_XorReductionV:  pxor(dst, src); break;
1753     case Op_MinReductionV:
1754       switch (typ) {
1755         case T_BYTE:        pminsb(dst, src); break;
1756         case T_SHORT:       pminsw(dst, src); break;
1757         case T_INT:         pminsd(dst, src); break;
1758         case T_LONG:        assert(UseAVX > 2, "required");
1759                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1760         default:            assert(false, "wrong type");
1761       }
1762       break;
1763     case Op_MaxReductionV:
1764       switch (typ) {
1765         case T_BYTE:        pmaxsb(dst, src); break;
1766         case T_SHORT:       pmaxsw(dst, src); break;
1767         case T_INT:         pmaxsd(dst, src); break;
1768         case T_LONG:        assert(UseAVX > 2, "required");
1769                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1770         default:            assert(false, "wrong type");
1771       }
1772       break;
1773     case Op_AddReductionVF: addss(dst, src); break;
1774     case Op_AddReductionVD: addsd(dst, src); break;
1775     case Op_AddReductionVI:
1776       switch (typ) {
1777         case T_BYTE:        paddb(dst, src); break;
1778         case T_SHORT:       paddw(dst, src); break;
1779         case T_INT:         paddd(dst, src); break;
1780         default:            assert(false, "wrong type");
1781       }
1782       break;
1783     case Op_AddReductionVL: paddq(dst, src); break;
1784     case Op_MulReductionVF: mulss(dst, src); break;
1785     case Op_MulReductionVD: mulsd(dst, src); break;
1786     case Op_MulReductionVI:
1787       switch (typ) {
1788         case T_SHORT:       pmullw(dst, src); break;
1789         case T_INT:         pmulld(dst, src); break;
1790         default:            assert(false, "wrong type");
1791       }
1792       break;
1793     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1794                             evpmullq(dst, dst, src, vector_len); break;
1795     default:                assert(false, "wrong opcode");
1796   }
1797 }
1798 
1799 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1800   int vector_len = Assembler::AVX_256bit;
1801 
1802   switch (opcode) {
1803     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1804     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1805     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1806     case Op_MinReductionV:
1807       switch (typ) {
1808         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1809         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1810         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1811         case T_LONG:        assert(UseAVX > 2, "required");
1812                             vpminsq(dst, src1, src2, vector_len); break;
1813         default:            assert(false, "wrong type");
1814       }
1815       break;
1816     case Op_MaxReductionV:
1817       switch (typ) {
1818         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1819         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1820         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1821         case T_LONG:        assert(UseAVX > 2, "required");
1822                             vpmaxsq(dst, src1, src2, vector_len); break;
1823         default:            assert(false, "wrong type");
1824       }
1825       break;
1826     case Op_AddReductionVI:
1827       switch (typ) {
1828         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1829         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1830         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1831         default:            assert(false, "wrong type");
1832       }
1833       break;
1834     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1835     case Op_MulReductionVI:
1836       switch (typ) {
1837         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1838         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1839         default:            assert(false, "wrong type");
1840       }
1841       break;
1842     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1843     default:                assert(false, "wrong opcode");
1844   }
1845 }
1846 
1847 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1848                                   XMMRegister dst, XMMRegister src,
1849                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1850   switch (opcode) {
1851     case Op_AddReductionVF:
1852     case Op_MulReductionVF:
1853       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1854       break;
1855 
1856     case Op_AddReductionVD:
1857     case Op_MulReductionVD:
1858       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1859       break;
1860 
1861     default: assert(false, "wrong opcode");
1862   }
1863 }
1864 
1865 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1866                              Register dst, Register src1, XMMRegister src2,
1867                              XMMRegister vtmp1, XMMRegister vtmp2) {
1868   switch (vlen) {
1869     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1870     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1871     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1872     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1873 
1874     default: assert(false, "wrong vector length");
1875   }
1876 }
1877 
1878 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1879                              Register dst, Register src1, XMMRegister src2,
1880                              XMMRegister vtmp1, XMMRegister vtmp2) {
1881   switch (vlen) {
1882     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1883     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1884     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1885     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1886 
1887     default: assert(false, "wrong vector length");
1888   }
1889 }
1890 
1891 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1892                              Register dst, Register src1, XMMRegister src2,
1893                              XMMRegister vtmp1, XMMRegister vtmp2) {
1894   switch (vlen) {
1895     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899 
1900     default: assert(false, "wrong vector length");
1901   }
1902 }
1903 
1904 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1905                              Register dst, Register src1, XMMRegister src2,
1906                              XMMRegister vtmp1, XMMRegister vtmp2) {
1907   switch (vlen) {
1908     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1909     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1910     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1912 
1913     default: assert(false, "wrong vector length");
1914   }
1915 }
1916 
1917 #ifdef _LP64
1918 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1919                              Register dst, Register src1, XMMRegister src2,
1920                              XMMRegister vtmp1, XMMRegister vtmp2) {
1921   switch (vlen) {
1922     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1924     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925 
1926     default: assert(false, "wrong vector length");
1927   }
1928 }
1929 #endif // _LP64
1930 
1931 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1932   switch (vlen) {
1933     case 2:
1934       assert(vtmp2 == xnoreg, "");
1935       reduce2F(opcode, dst, src, vtmp1);
1936       break;
1937     case 4:
1938       assert(vtmp2 == xnoreg, "");
1939       reduce4F(opcode, dst, src, vtmp1);
1940       break;
1941     case 8:
1942       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1943       break;
1944     case 16:
1945       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1946       break;
1947     default: assert(false, "wrong vector length");
1948   }
1949 }
1950 
1951 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1952   switch (vlen) {
1953     case 2:
1954       assert(vtmp2 == xnoreg, "");
1955       reduce2D(opcode, dst, src, vtmp1);
1956       break;
1957     case 4:
1958       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1959       break;
1960     case 8:
1961       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1962       break;
1963     default: assert(false, "wrong vector length");
1964   }
1965 }
1966 
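     // Fold the two int lanes of src2 together (phaddd for add, shuffle + op otherwise),
     // then combine the result with the scalar accumulator passed in src1.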
1967 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1968   if (opcode == Op_AddReductionVI) {
1969     if (vtmp1 != src2) {
1970       movdqu(vtmp1, src2);
1971     }
1972     phaddd(vtmp1, vtmp1);
1973   } else {
1974     pshufd(vtmp1, src2, 0x1);
1975     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1976   }
1977   movdl(vtmp2, src1);
1978   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1979   movdl(dst, vtmp1);
1980 }
1981 
1982 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1983   if (opcode == Op_AddReductionVI) {
1984     if (vtmp1 != src2) {
1985       movdqu(vtmp1, src2);
1986     }
1987     phaddd(vtmp1, src2);
1988     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1989   } else {
1990     pshufd(vtmp2, src2, 0xE);
1991     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1992     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1997   if (opcode == Op_AddReductionVI) {
1998     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1999     vextracti128_high(vtmp2, vtmp1);
2000     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2001     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2002   } else {
2003     vextracti128_high(vtmp1, src2);
2004     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2005     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2006   }
2007 }
2008 
2009 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   vextracti64x4_high(vtmp2, src2);
2011   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2012   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2013 }
2014 
2015 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2016   pshufd(vtmp2, src2, 0x1);
2017   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2018   movdqu(vtmp1, vtmp2);
2019   psrldq(vtmp1, 2);
2020   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2021   movdqu(vtmp2, vtmp1);
2022   psrldq(vtmp2, 1);
2023   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2024   movdl(vtmp2, src1);
2025   pmovsxbd(vtmp1, vtmp1);
2026   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2027   pextrb(dst, vtmp1, 0x0);
2028   movsbl(dst, dst);
2029 }
2030 
2031 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2032   pshufd(vtmp1, src2, 0xE);
2033   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2034   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2035 }
2036 
2037 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2038   vextracti128_high(vtmp2, src2);
2039   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2040   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2041 }
2042 
2043 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2044   vextracti64x4_high(vtmp1, src2);
2045   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2046   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2047 }
2048 
2049 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2050   pmovsxbw(vtmp2, src2);
2051   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2052 }
2053 
2054 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2055   if (UseAVX > 1) {
2056     int vector_len = Assembler::AVX_256bit;
2057     vpmovsxbw(vtmp1, src2, vector_len);
2058     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2059   } else {
2060     pmovsxbw(vtmp2, src2);
2061     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2062     pshufd(vtmp2, src2, 0x1);
2063     pmovsxbw(vtmp2, src2);
2064     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2065   }
2066 }
2067 
2068 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2069   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2070     int vector_len = Assembler::AVX_512bit;
2071     vpmovsxbw(vtmp1, src2, vector_len);
2072     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2073   } else {
2074     assert(UseAVX >= 2, "required");
2075     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2076     vextracti128_high(vtmp2, src2);
2077     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2078   }
2079 }
2080 
2081 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2082   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2083   vextracti64x4_high(vtmp2, src2);
2084   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2085 }
2086 
2087 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2088   if (opcode == Op_AddReductionVI) {
2089     if (vtmp1 != src2) {
2090       movdqu(vtmp1, src2);
2091     }
2092     phaddw(vtmp1, vtmp1);
2093     phaddw(vtmp1, vtmp1);
2094   } else {
2095     pshufd(vtmp2, src2, 0x1);
2096     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2097     movdqu(vtmp1, vtmp2);
2098     psrldq(vtmp1, 2);
2099     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2100   }
2101   movdl(vtmp2, src1);
2102   pmovsxwd(vtmp1, vtmp1);
2103   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2104   pextrw(dst, vtmp1, 0x0);
2105   movswl(dst, dst);
2106 }
2107 
2108 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2109   if (opcode == Op_AddReductionVI) {
2110     if (vtmp1 != src2) {
2111       movdqu(vtmp1, src2);
2112     }
2113     phaddw(vtmp1, src2);
2114   } else {
2115     pshufd(vtmp1, src2, 0xE);
2116     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2117   }
2118   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2119 }
2120 
2121 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2122   if (opcode == Op_AddReductionVI) {
2123     int vector_len = Assembler::AVX_256bit;
2124     vphaddw(vtmp2, src2, src2, vector_len);
2125     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2126   } else {
2127     vextracti128_high(vtmp2, src2);
2128     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2129   }
2130   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2131 }
2132 
2133 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2134   int vector_len = Assembler::AVX_256bit;
2135   vextracti64x4_high(vtmp1, src2);
2136   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2137   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2138 }
2139 
2140 #ifdef _LP64
2141 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2142   pshufd(vtmp2, src2, 0xE);
2143   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2144   movdq(vtmp1, src1);
2145   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2146   movdq(dst, vtmp1);
2147 }
2148 
2149 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2150   vextracti128_high(vtmp1, src2);
2151   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2152   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2153 }
2154 
2155 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2156   vextracti64x4_high(vtmp2, src2);
2157   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2158   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2159 }
2160 
2161 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
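       // e.g. len == 5: bzhi clears every bit at position >= 5, leaving 0x1f, so the
       // opmask ends up with its lowest five bits set.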
2162   mov64(temp, -1L);
2163   bzhiq(temp, temp, len);
2164   kmovql(dst, temp);
2165 }
2166 #endif // _LP64
2167 
2168 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2169   reduce_operation_128(T_FLOAT, opcode, dst, src);
2170   pshufd(vtmp, src, 0x1);
2171   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2172 }
2173 
2174 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2175   reduce2F(opcode, dst, src, vtmp);
2176   pshufd(vtmp, src, 0x2);
2177   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2178   pshufd(vtmp, src, 0x3);
2179   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2180 }
2181 
2182 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   reduce4F(opcode, dst, src, vtmp2);
2184   vextractf128_high(vtmp2, src);
2185   reduce4F(opcode, dst, vtmp2, vtmp1);
2186 }
2187 
2188 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2189   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2190   vextracti64x4_high(vtmp1, src);
2191   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2192 }
2193 
2194 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2195   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2196   pshufd(vtmp, src, 0xE);
2197   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2198 }
2199 
2200 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2201   reduce2D(opcode, dst, src, vtmp2);
2202   vextractf128_high(vtmp2, src);
2203   reduce2D(opcode, dst, vtmp2, vtmp1);
2204 }
2205 
2206 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2207   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2208   vextracti64x4_high(vtmp1, src);
2209   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2210 }
2211 
2212 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2213   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2214 }
2215 
2216 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2217   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2218 }
2219 
2220 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2221                                  int vec_enc) {
2222   switch(elem_bt) {
2223     case T_INT:
2224     case T_FLOAT:
2225       vmaskmovps(dst, src, mask, vec_enc);
2226       break;
2227     case T_LONG:
2228     case T_DOUBLE:
2229       vmaskmovpd(dst, src, mask, vec_enc);
2230       break;
2231     default:
2232       fatal("Unsupported type %s", type2name(elem_bt));
2233       break;
2234   }
2235 }
2236 
2237 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2238                                  int vec_enc) {
2239   switch(elem_bt) {
2240     case T_INT:
2241     case T_FLOAT:
2242       vmaskmovps(dst, src, mask, vec_enc);
2243       break;
2244     case T_LONG:
2245     case T_DOUBLE:
2246       vmaskmovpd(dst, src, mask, vec_enc);
2247       break;
2248     default:
2249       fatal("Unsupported type %s", type2name(elem_bt));
2250       break;
2251   }
2252 }
2253 
2254 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2255                                           XMMRegister dst, XMMRegister src,
2256                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2257                                           XMMRegister xmm_0, XMMRegister xmm_1) {
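       // vpermilps immediates used below: 1 brings element 1 into position 0; 14
       // (0b1110) brings elements 2 and 3 into positions 0 and 1 (per 128-bit lane).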
2258   const int permconst[] = {1, 14};
2259   XMMRegister wsrc = src;
2260   XMMRegister wdst = xmm_0;
2261   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2262 
2263   int vlen_enc = Assembler::AVX_128bit;
2264   if (vlen == 16) {
2265     vlen_enc = Assembler::AVX_256bit;
2266   }
2267 
2268   for (int i = log2(vlen) - 1; i >=0; i--) {
2269     if (i == 0 && !is_dst_valid) {
2270       wdst = dst;
2271     }
2272     if (i == 3) {
2273       vextracti64x4_high(wtmp, wsrc);
2274     } else if (i == 2) {
2275       vextracti128_high(wtmp, wsrc);
2276     } else { // i = [0,1]
2277       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2278     }
2279     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2280     wsrc = wdst;
2281     vlen_enc = Assembler::AVX_128bit;
2282   }
2283   if (is_dst_valid) {
2284     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2285   }
2286 }
2287 
2288 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2289                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2290                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2291   XMMRegister wsrc = src;
2292   XMMRegister wdst = xmm_0;
2293   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2294   int vlen_enc = Assembler::AVX_128bit;
2295   if (vlen == 8) {
2296     vlen_enc = Assembler::AVX_256bit;
2297   }
2298   for (int i = log2(vlen) - 1; i >=0; i--) {
2299     if (i == 0 && !is_dst_valid) {
2300       wdst = dst;
2301     }
2302     if (i == 1) {
2303       vextracti128_high(wtmp, wsrc);
2304     } else if (i == 2) {
2305       vextracti64x4_high(wtmp, wsrc);
2306     } else {
2307       assert(i == 0, "%d", i);
2308       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2309     }
2310     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2311     wsrc = wdst;
2312     vlen_enc = Assembler::AVX_128bit;
2313   }
2314   if (is_dst_valid) {
2315     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2316   }
2317 }
2318 
2319 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2320   switch (bt) {
2321     case T_BYTE:  pextrb(dst, src, idx); break;
2322     case T_SHORT: pextrw(dst, src, idx); break;
2323     case T_INT:   pextrd(dst, src, idx); break;
2324     case T_LONG:  pextrq(dst, src, idx); break;
2325 
2326     default:
2327       assert(false,"Should not reach here.");
2328       break;
2329   }
2330 }
2331 
2332 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2333   int esize =  type2aelembytes(typ);
2334   int elem_per_lane = 16/esize;
2335   int lane = elemindex / elem_per_lane;
2336   int eindex = elemindex % elem_per_lane;
2337 
2338   if (lane >= 2) {
2339     assert(UseAVX > 2, "required");
2340     vextractf32x4(dst, src, lane & 3);
2341     return dst;
2342   } else if (lane > 0) {
2343     assert(UseAVX > 0, "required");
2344     vextractf128(dst, src, lane);
2345     return dst;
2346   } else {
2347     return src;
2348   }
2349 }
2350 
2351 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2352   if (typ == T_BYTE) {
2353     movsbl(dst, dst);
2354   } else if (typ == T_SHORT) {
2355     movswl(dst, dst);
2356   }
2357 }
2358 
2359 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2360   int esize =  type2aelembytes(typ);
2361   int elem_per_lane = 16/esize;
2362   int eindex = elemindex % elem_per_lane;
2363   assert(is_integral_type(typ),"required");
2364 
2365   if (eindex == 0) {
2366     if (typ == T_LONG) {
2367       movq(dst, src);
2368     } else {
2369       movdl(dst, src);
2370       movsxl(typ, dst);
2371     }
2372   } else {
2373     extract(typ, dst, src, eindex);
2374     movsxl(typ, dst);
2375   }
2376 }
2377 
2378 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2379   int esize =  type2aelembytes(typ);
2380   int elem_per_lane = 16/esize;
2381   int eindex = elemindex % elem_per_lane;
2382   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2383 
2384   if (eindex == 0) {
2385     movq(dst, src);
2386   } else {
2387     if (typ == T_FLOAT) {
2388       if (UseAVX == 0) {
2389         movdqu(dst, src);
2390         shufps(dst, dst, eindex);
2391       } else {
2392         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2393       }
2394     } else {
2395       if (UseAVX == 0) {
2396         movdqu(dst, src);
2397         psrldq(dst, eindex*esize);
2398       } else {
2399         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2400       }
2401       movq(dst, dst);
2402     }
2403   }
2404   // Zero upper bits
2405   if (typ == T_FLOAT) {
2406     if (UseAVX == 0) {
2407       assert(vtmp != xnoreg, "required.");
2408       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2409       pand(dst, vtmp);
2410     } else {
2411       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2412     }
2413   }
2414 }
2415 
2416 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2417   switch(typ) {
2418     case T_BYTE:
2419     case T_BOOLEAN:
2420       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2421       break;
2422     case T_SHORT:
2423     case T_CHAR:
2424       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2425       break;
2426     case T_INT:
2427     case T_FLOAT:
2428       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2429       break;
2430     case T_LONG:
2431     case T_DOUBLE:
2432       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2433       break;
2434     default:
2435       assert(false,"Should not reach here.");
2436       break;
2437   }
2438 }
2439 
2440 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2441   assert(rscratch != noreg || always_reachable(src2), "missing");
2442 
2443   switch(typ) {
2444     case T_BOOLEAN:
2445     case T_BYTE:
2446       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2447       break;
2448     case T_CHAR:
2449     case T_SHORT:
2450       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2451       break;
2452     case T_INT:
2453     case T_FLOAT:
2454       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2455       break;
2456     case T_LONG:
2457     case T_DOUBLE:
2458       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2459       break;
2460     default:
2461       assert(false,"Should not reach here.");
2462       break;
2463   }
2464 }
2465 
2466 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2467   switch(typ) {
2468     case T_BYTE:
2469       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2470       break;
2471     case T_SHORT:
2472       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2473       break;
2474     case T_INT:
2475     case T_FLOAT:
2476       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2477       break;
2478     case T_LONG:
2479     case T_DOUBLE:
2480       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2481       break;
2482     default:
2483       assert(false,"Should not reach here.");
2484       break;
2485   }
2486 }
2487 
2488 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2489   assert(vlen_in_bytes <= 32, "");
2490   int esize = type2aelembytes(bt);
2491   if (vlen_in_bytes == 32) {
2492     assert(vtmp == xnoreg, "required.");
2493     if (esize >= 4) {
2494       vtestps(src1, src2, AVX_256bit);
2495     } else {
2496       vptest(src1, src2, AVX_256bit);
2497     }
2498     return;
2499   }
2500   if (vlen_in_bytes < 16) {
2501     // Duplicate the lower part to fill the whole register;
2502     // there is no need to do so for src2.
2503     assert(vtmp != xnoreg, "required");
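         // 0x00 broadcasts dword 0; 0x04 keeps dwords 0-1 and fills the upper half
         // with copies of dword 0.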
2504     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2505     pshufd(vtmp, src1, shuffle_imm);
2506   } else {
2507     assert(vtmp == xnoreg, "required");
2508     vtmp = src1;
2509   }
2510   if (esize >= 4 && VM_Version::supports_avx()) {
2511     vtestps(vtmp, src2, AVX_128bit);
2512   } else {
2513     ptest(vtmp, src2);
2514   }
2515 }
2516 
2517 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2518   assert(UseAVX >= 2, "required");
2519 #ifdef ASSERT
2520   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2521   bool is_bw_supported = VM_Version::supports_avx512bw();
2522   if (is_bw && !is_bw_supported) {
2523     assert(vlen_enc != Assembler::AVX_512bit, "required");
2524     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2525            "XMM register should be 0-15");
2526   }
2527 #endif // ASSERT
2528   switch (elem_bt) {
2529     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2530     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2531     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2532     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2533     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2534     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2535     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2536   }
2537 }
2538 
2539 #ifdef _LP64
2540 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2541   assert(UseAVX >= 2, "required");
2542   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2543   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2544   if ((UseAVX > 2) &&
2545       (!is_bw || VM_Version::supports_avx512bw()) &&
2546       (!is_vl || VM_Version::supports_avx512vl())) {
2547     switch (elem_bt) {
2548       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2549       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2550       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2551       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2552       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2553     }
2554   } else {
2555     assert(vlen_enc != Assembler::AVX_512bit, "required");
2556     assert((dst->encoding() < 16),"XMM register should be 0-15");
2557     switch (elem_bt) {
2558       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2559       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2560       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2561       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2562       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2563       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2564       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2565     }
2566   }
2567 }
2568 #endif
2569 
2570 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2571   switch (to_elem_bt) {
2572     case T_SHORT:
2573       vpmovsxbw(dst, src, vlen_enc);
2574       break;
2575     case T_INT:
2576       vpmovsxbd(dst, src, vlen_enc);
2577       break;
2578     case T_FLOAT:
2579       vpmovsxbd(dst, src, vlen_enc);
2580       vcvtdq2ps(dst, dst, vlen_enc);
2581       break;
2582     case T_LONG:
2583       vpmovsxbq(dst, src, vlen_enc);
2584       break;
2585     case T_DOUBLE: {
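           // The int intermediate needs only half the width of the final double vector.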
2586       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2587       vpmovsxbd(dst, src, mid_vlen_enc);
2588       vcvtdq2pd(dst, dst, vlen_enc);
2589       break;
2590     }
2591     default:
2592       fatal("Unsupported type %s", type2name(to_elem_bt));
2593       break;
2594   }
2595 }
2596 
2597 //-------------------------------------------------------------------------------------------
2598 
2599 // IndexOf for constant substrings with size >= 8 chars
2600 // which don't need to be loaded through the stack.
2601 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2602                                          Register cnt1, Register cnt2,
2603                                          int int_cnt2,  Register result,
2604                                          XMMRegister vec, Register tmp,
2605                                          int ae) {
2606   ShortBranchVerifier sbv(this);
2607   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2608   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2609 
2610   // This method uses the pcmpestri instruction with bound registers
2611   //   inputs:
2612   //     xmm - substring
2613   //     rax - substring length (elements count)
2614   //     mem - scanned string
2615   //     rdx - string length (elements count)
2616   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2617   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2618   //   outputs:
2619   //     rcx - matched index in string
2620   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2621   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2622   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2623   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2624   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
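       // scale1 addresses elements of the scanned string (1 byte for LL, 2 bytes
       // otherwise); for UL the substring is Latin-1, so scale2 stays at 1 byte.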
2625 
2626   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2627         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2628         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2629 
2630   // Note, inline_string_indexOf() generates checks:
2631   // if (substr.count > string.count) return -1;
2632   // if (substr.count == 0) return 0;
2633   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2634 
2635   // Load substring.
2636   if (ae == StrIntrinsicNode::UL) {
2637     pmovzxbw(vec, Address(str2, 0));
2638   } else {
2639     movdqu(vec, Address(str2, 0));
2640   }
2641   movl(cnt2, int_cnt2);
2642   movptr(result, str1); // string addr
2643 
2644   if (int_cnt2 > stride) {
2645     jmpb(SCAN_TO_SUBSTR);
2646 
2647     // Reload substr for rescan; this code
2648     // is executed only for large substrings (> stride elements).
2649     bind(RELOAD_SUBSTR);
2650     if (ae == StrIntrinsicNode::UL) {
2651       pmovzxbw(vec, Address(str2, 0));
2652     } else {
2653       movdqu(vec, Address(str2, 0));
2654     }
2655     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2656 
2657     bind(RELOAD_STR);
2658     // We came here after the beginning of the substring was
2659     // matched but the rest of it was not so we need to search
2660     // again. Start from the next element after the previous match.
2661 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2664     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2665     subl(cnt1, cnt2);
2666     addl(cnt1, int_cnt2);
2667     movl(cnt2, int_cnt2); // Now restore cnt2
2668 
2669     decrementl(cnt1);     // Shift to next element
2670     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2672 
2673     addptr(result, (1<<scale1));
2674 
2675   } // (int_cnt2 > 8)
2676 
2677   // Scan string for start of substr in 16-byte vectors
2678   bind(SCAN_TO_SUBSTR);
2679   pcmpestri(vec, Address(result, 0), mode);
2680   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2681   subl(cnt1, stride);
2682   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2683   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2685   addptr(result, 16);
2686   jmpb(SCAN_TO_SUBSTR);
2687 
2688   // Found a potential substr
2689   bind(FOUND_CANDIDATE);
2690   // Matched whole vector if first element matched (tmp(rcx) == 0).
2691   if (int_cnt2 == stride) {
2692     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2693   } else { // int_cnt2 > 8
2694     jccb(Assembler::overflow, FOUND_SUBSTR);
2695   }
2696   // After pcmpestri tmp(rcx) contains matched element index
2697   // Compute start addr of substr
2698   lea(result, Address(result, tmp, scale1));
2699 
2700   // Make sure string is still long enough
2701   subl(cnt1, tmp);
2702   cmpl(cnt1, cnt2);
2703   if (int_cnt2 == stride) {
2704     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2705   } else { // int_cnt2 > 8
2706     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2707   }
  // Left less than substring.
2709 
2710   bind(RET_NOT_FOUND);
2711   movl(result, -1);
2712   jmp(EXIT);
2713 
2714   if (int_cnt2 > stride) {
2715     // This code is optimized for the case when whole substring
2716     // is matched if its head is matched.
2717     bind(MATCH_SUBSTR_HEAD);
2718     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2720     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2721 
2722     Label CONT_SCAN_SUBSTR;
2723     // Compare the rest of substring (> 8 chars).
2724     bind(FOUND_SUBSTR);
2725     // First 8 chars are already matched.
2726     negptr(cnt2);
2727     addptr(cnt2, stride);
2728 
2729     bind(SCAN_SUBSTR);
2730     subl(cnt1, stride);
2731     cmpl(cnt2, -stride); // Do not read beyond substring
2732     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2733     // Back-up strings to avoid reading beyond substring:
2734     // cnt1 = cnt1 - cnt2 + 8
2735     addl(cnt1, cnt2); // cnt2 is negative
2736     addl(cnt1, stride);
2737     movl(cnt2, stride); negptr(cnt2);
2738     bind(CONT_SCAN_SUBSTR);
2739     if (int_cnt2 < (int)G) {
2740       int tail_off1 = int_cnt2<<scale1;
2741       int tail_off2 = int_cnt2<<scale2;
2742       if (ae == StrIntrinsicNode::UL) {
2743         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2744       } else {
2745         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2746       }
2747       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2748     } else {
2749       // calculate index in register to avoid integer overflow (int_cnt2*2)
2750       movl(tmp, int_cnt2);
2751       addptr(tmp, cnt2);
2752       if (ae == StrIntrinsicNode::UL) {
2753         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2754       } else {
2755         movdqu(vec, Address(str2, tmp, scale2, 0));
2756       }
2757       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2758     }
    // Need to reload the string pointers if the whole vector did not match
2760     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2761     addptr(cnt2, stride);
2762     jcc(Assembler::negative, SCAN_SUBSTR);
2763     // Fall through if found full substring
2764 
2765   } // (int_cnt2 > 8)
2766 
2767   bind(RET_FOUND);
2768   // Found result if we matched full small substring.
2769   // Compute substr offset
2770   subptr(result, str1);
2771   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2772     shrl(result, 1); // index
2773   }
2774   bind(EXIT);
2775 
2776 } // string_indexofC8
2777 
// Small strings are loaded through the stack if they cross a page boundary.
2779 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2780                                        Register cnt1, Register cnt2,
2781                                        int int_cnt2,  Register result,
2782                                        XMMRegister vec, Register tmp,
2783                                        int ae) {
2784   ShortBranchVerifier sbv(this);
2785   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2786   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2787 
2788   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
2792   //
2793   // Note, inline_string_indexOf() generates checks:
2794   // if (substr.count > string.count) return -1;
2795   // if (substr.count == 0) return 0;
2796   //
2797   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2798   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2799   // This method uses the pcmpestri instruction with bound registers
2800   //   inputs:
2801   //     xmm - substring
2802   //     rax - substring length (elements count)
2803   //     mem - scanned string
2804   //     rdx - string length (elements count)
2805   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2806   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2807   //   outputs:
2808   //     rcx - matched index in string
2809   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2810   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2811   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2812   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2813 
2814   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2815         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2816         FOUND_CANDIDATE;
2817 
2818   { //========================================================
2819     // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
2821     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2822 
2823     movptr(tmp, rsp); // save old SP
2824 
2825     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2826       if (int_cnt2 == (1>>scale2)) { // One byte
2827         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2828         load_unsigned_byte(result, Address(str2, 0));
2829         movdl(vec, result); // move 32 bits
2830       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2831         // Not enough header space in 32-bit VM: 12+3 = 15.
2832         movl(result, Address(str2, -1));
2833         shrl(result, 8);
2834         movdl(vec, result); // move 32 bits
2835       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2836         load_unsigned_short(result, Address(str2, 0));
2837         movdl(vec, result); // move 32 bits
2838       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2839         movdl(vec, Address(str2, 0)); // move 32 bits
2840       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2841         movq(vec, Address(str2, 0));  // move 64 bits
2842       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2843         // Array header size is 12 bytes in 32-bit VM
2844         // + 6 bytes for 3 chars == 18 bytes,
2845         // enough space to load vec and shift.
2846         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2847         if (ae == StrIntrinsicNode::UL) {
2848           int tail_off = int_cnt2-8;
2849           pmovzxbw(vec, Address(str2, tail_off));
2850           psrldq(vec, -2*tail_off);
2851         }
2852         else {
2853           int tail_off = int_cnt2*(1<<scale2);
2854           movdqu(vec, Address(str2, tail_off-16));
2855           psrldq(vec, 16-tail_off);
2856         }
2857       }
2858     } else { // not constant substring
2859       cmpl(cnt2, stride);
2860       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2861 
      // We can read beyond the string if str+16 does not cross a page boundary,
      // since heaps are aligned and mapped by pages.
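      // For example, with a 4K page: (str2 & 0xfff) <= 0xff0 guarantees that the
      // 16-byte load stays within the page the string starts on.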
2864       assert(os::vm_page_size() < (int)G, "default page should be small");
2865       movl(result, str2); // We need only low 32 bits
2866       andl(result, ((int)os::vm_page_size()-1));
2867       cmpl(result, ((int)os::vm_page_size()-16));
2868       jccb(Assembler::belowEqual, CHECK_STR);
2869 
      // Move small strings to the stack to allow loading 16 bytes into vec.
2871       subptr(rsp, 16);
2872       int stk_offset = wordSize-(1<<scale2);
2873       push(cnt2);
2874 
2875       bind(COPY_SUBSTR);
2876       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2877         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2878         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2879       } else if (ae == StrIntrinsicNode::UU) {
2880         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2881         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2882       }
2883       decrement(cnt2);
2884       jccb(Assembler::notZero, COPY_SUBSTR);
2885 
2886       pop(cnt2);
2887       movptr(str2, rsp);  // New substring address
2888     } // non constant
2889 
2890     bind(CHECK_STR);
2891     cmpl(cnt1, stride);
2892     jccb(Assembler::aboveEqual, BIG_STRINGS);
2893 
2894     // Check cross page boundary.
2895     movl(result, str1); // We need only low 32 bits
2896     andl(result, ((int)os::vm_page_size()-1));
2897     cmpl(result, ((int)os::vm_page_size()-16));
2898     jccb(Assembler::belowEqual, BIG_STRINGS);
2899 
2900     subptr(rsp, 16);
2901     int stk_offset = -(1<<scale1);
2902     if (int_cnt2 < 0) { // not constant
2903       push(cnt2);
2904       stk_offset += wordSize;
2905     }
2906     movl(cnt2, cnt1);
2907 
2908     bind(COPY_STR);
2909     if (ae == StrIntrinsicNode::LL) {
2910       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2911       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2912     } else {
2913       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2914       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2915     }
2916     decrement(cnt2);
2917     jccb(Assembler::notZero, COPY_STR);
2918 
2919     if (int_cnt2 < 0) { // not constant
2920       pop(cnt2);
2921     }
2922     movptr(str1, rsp);  // New string address
2923 
2924     bind(BIG_STRINGS);
2925     // Load substring.
2926     if (int_cnt2 < 0) { // -1
2927       if (ae == StrIntrinsicNode::UL) {
2928         pmovzxbw(vec, Address(str2, 0));
2929       } else {
2930         movdqu(vec, Address(str2, 0));
2931       }
2932       push(cnt2);       // substr count
2933       push(str2);       // substr addr
2934       push(str1);       // string addr
2935     } else {
2936       // Small (< 8 chars) constant substrings are loaded already.
2937       movl(cnt2, int_cnt2);
2938     }
2939     push(tmp);  // original SP
2940 
2941   } // Finished loading
2942 
2943   //========================================================
2944   // Start search
2945   //
2946 
2947   movptr(result, str1); // string addr
2948 
2949   if (int_cnt2  < 0) {  // Only for non constant substring
2950     jmpb(SCAN_TO_SUBSTR);
2951 
2952     // SP saved at sp+0
2953     // String saved at sp+1*wordSize
2954     // Substr saved at sp+2*wordSize
2955     // Substr count saved at sp+3*wordSize
2956 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2959     bind(RELOAD_SUBSTR);
2960     movptr(str2, Address(rsp, 2*wordSize));
2961     movl(cnt2, Address(rsp, 3*wordSize));
2962     if (ae == StrIntrinsicNode::UL) {
2963       pmovzxbw(vec, Address(str2, 0));
2964     } else {
2965       movdqu(vec, Address(str2, 0));
2966     }
2967     // We came here after the beginning of the substring was
2968     // matched but the rest of it was not so we need to search
2969     // again. Start from the next element after the previous match.
2970     subptr(str1, result); // Restore counter
2971     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2972       shrl(str1, 1);
2973     }
2974     addl(cnt1, str1);
2975     decrementl(cnt1);   // Shift to next element
2976     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2978 
2979     addptr(result, (1<<scale1));
2980   } // non constant
2981 
2982   // Scan string for start of substr in 16-byte vectors
2983   bind(SCAN_TO_SUBSTR);
2984   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2985   pcmpestri(vec, Address(result, 0), mode);
2986   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2987   subl(cnt1, stride);
2988   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2989   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2991   addptr(result, 16);
2992 
2993   bind(ADJUST_STR);
2994   cmpl(cnt1, stride); // Do not read beyond string
2995   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2996   // Back-up string to avoid reading beyond string.
2997   lea(result, Address(result, cnt1, scale1, -16));
2998   movl(cnt1, stride);
2999   jmpb(SCAN_TO_SUBSTR);
3000 
3001   // Found a potential substr
3002   bind(FOUND_CANDIDATE);
3003   // After pcmpestri tmp(rcx) contains matched element index
3004 
3005   // Make sure string is still long enough
3006   subl(cnt1, tmp);
3007   cmpl(cnt1, cnt2);
3008   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3010 
3011   bind(RET_NOT_FOUND);
3012   movl(result, -1);
3013   jmp(CLEANUP);
3014 
3015   bind(FOUND_SUBSTR);
3016   // Compute start addr of substr
3017   lea(result, Address(result, tmp, scale1));
3018   if (int_cnt2 > 0) { // Constant substring
3019     // Repeat search for small substring (< 8 chars)
3020     // from new point without reloading substring.
3021     // Have to check that we don't read beyond string.
3022     cmpl(tmp, stride-int_cnt2);
3023     jccb(Assembler::greater, ADJUST_STR);
3024     // Fall through if matched whole substring.
3025   } else { // non constant
3026     assert(int_cnt2 == -1, "should be != 0");
3027 
3028     addl(tmp, cnt2);
3029     // Found result if we matched whole substring.
3030     cmpl(tmp, stride);
3031     jcc(Assembler::lessEqual, RET_FOUND);
3032 
3033     // Repeat search for small substring (<= 8 chars)
3034     // from new point 'str1' without reloading substring.
3035     cmpl(cnt2, stride);
3036     // Have to check that we don't read beyond string.
3037     jccb(Assembler::lessEqual, ADJUST_STR);
3038 
3039     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3040     // Compare the rest of substring (> 8 chars).
3041     movptr(str1, result);
3042 
3043     cmpl(tmp, cnt2);
3044     // First 8 chars are already matched.
3045     jccb(Assembler::equal, CHECK_NEXT);
3046 
3047     bind(SCAN_SUBSTR);
3048     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload the string pointers if the whole vector did not match
3050     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3051 
3052     bind(CHECK_NEXT);
3053     subl(cnt2, stride);
3054     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3055     addptr(str1, 16);
3056     if (ae == StrIntrinsicNode::UL) {
3057       addptr(str2, 8);
3058     } else {
3059       addptr(str2, 16);
3060     }
3061     subl(cnt1, stride);
3062     cmpl(cnt2, stride); // Do not read beyond substring
3063     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3064     // Back-up strings to avoid reading beyond substring.
3065 
3066     if (ae == StrIntrinsicNode::UL) {
3067       lea(str2, Address(str2, cnt2, scale2, -8));
3068       lea(str1, Address(str1, cnt2, scale1, -16));
3069     } else {
3070       lea(str2, Address(str2, cnt2, scale2, -16));
3071       lea(str1, Address(str1, cnt2, scale1, -16));
3072     }
3073     subl(cnt1, cnt2);
3074     movl(cnt2, stride);
3075     addl(cnt1, stride);
3076     bind(CONT_SCAN_SUBSTR);
3077     if (ae == StrIntrinsicNode::UL) {
3078       pmovzxbw(vec, Address(str2, 0));
3079     } else {
3080       movdqu(vec, Address(str2, 0));
3081     }
3082     jmp(SCAN_SUBSTR);
3083 
3084     bind(RET_FOUND_LONG);
3085     movptr(str1, Address(rsp, wordSize));
3086   } // non constant
3087 
3088   bind(RET_FOUND);
3089   // Compute substr offset
3090   subptr(result, str1);
3091   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3092     shrl(result, 1); // index
3093   }
3094   bind(CLEANUP);
3095   pop(rsp); // restore SP
3096 
3097 } // string_indexof
3098 
3099 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3100                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3101   ShortBranchVerifier sbv(this);
3102   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3103 
3104   int stride = 8;
3105 
3106   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3107         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3108         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3109         FOUND_SEQ_CHAR, DONE_LABEL;
3110 
3111   movptr(result, str1);
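  // Sketch of the scan below: broadcast 'ch' across a vector register and compare
  // 16 chars (AVX2) or 8 chars (SSE) per iteration; ptest/vptest against the
  // all-zero vec2 sets CF only when the comparison result is all zero, so
  // carryClear means at least one lane matched.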
3112   if (UseAVX >= 2) {
3113     cmpl(cnt1, stride);
3114     jcc(Assembler::less, SCAN_TO_CHAR);
3115     cmpl(cnt1, 2*stride);
3116     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3117     movdl(vec1, ch);
3118     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3119     vpxor(vec2, vec2);
3120     movl(tmp, cnt1);
3121     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3122     andl(cnt1,0x0000000F);  //tail count (in chars)
3123 
3124     bind(SCAN_TO_16_CHAR_LOOP);
3125     vmovdqu(vec3, Address(result, 0));
3126     vpcmpeqw(vec3, vec3, vec1, 1);
3127     vptest(vec2, vec3);
3128     jcc(Assembler::carryClear, FOUND_CHAR);
3129     addptr(result, 32);
3130     subl(tmp, 2*stride);
3131     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3132     jmp(SCAN_TO_8_CHAR);
3133     bind(SCAN_TO_8_CHAR_INIT);
3134     movdl(vec1, ch);
3135     pshuflw(vec1, vec1, 0x00);
3136     pshufd(vec1, vec1, 0);
3137     pxor(vec2, vec2);
3138   }
3139   bind(SCAN_TO_8_CHAR);
3140   cmpl(cnt1, stride);
3141   jcc(Assembler::less, SCAN_TO_CHAR);
3142   if (UseAVX < 2) {
3143     movdl(vec1, ch);
3144     pshuflw(vec1, vec1, 0x00);
3145     pshufd(vec1, vec1, 0);
3146     pxor(vec2, vec2);
3147   }
3148   movl(tmp, cnt1);
3149   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3150   andl(cnt1,0x00000007);  //tail count (in chars)
3151 
3152   bind(SCAN_TO_8_CHAR_LOOP);
3153   movdqu(vec3, Address(result, 0));
3154   pcmpeqw(vec3, vec1);
3155   ptest(vec2, vec3);
3156   jcc(Assembler::carryClear, FOUND_CHAR);
3157   addptr(result, 16);
3158   subl(tmp, stride);
3159   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3160   bind(SCAN_TO_CHAR);
3161   testl(cnt1, cnt1);
3162   jcc(Assembler::zero, RET_NOT_FOUND);
3163   bind(SCAN_TO_CHAR_LOOP);
3164   load_unsigned_short(tmp, Address(result, 0));
3165   cmpl(ch, tmp);
3166   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3167   addptr(result, 2);
3168   subl(cnt1, 1);
3169   jccb(Assembler::zero, RET_NOT_FOUND);
3170   jmp(SCAN_TO_CHAR_LOOP);
3171 
3172   bind(RET_NOT_FOUND);
3173   movl(result, -1);
3174   jmpb(DONE_LABEL);
3175 
3176   bind(FOUND_CHAR);
3177   if (UseAVX >= 2) {
3178     vpmovmskb(tmp, vec3);
3179   } else {
3180     pmovmskb(tmp, vec3);
3181   }
3182   bsfl(ch, tmp);
3183   addptr(result, ch);
3184 
3185   bind(FOUND_SEQ_CHAR);
3186   subptr(result, str1);
3187   shrl(result, 1);
3188 
3189   bind(DONE_LABEL);
3190 } // string_indexof_char
3191 
3192 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3193                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3194   ShortBranchVerifier sbv(this);
3195   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3196 
3197   int stride = 16;
3198 
3199   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3200         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3201         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3202         FOUND_SEQ_CHAR, DONE_LABEL;
3203 
3204   movptr(result, str1);
3205   if (UseAVX >= 2) {
3206     cmpl(cnt1, stride);
3207     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3208     cmpl(cnt1, stride*2);
3209     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3210     movdl(vec1, ch);
3211     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3212     vpxor(vec2, vec2);
3213     movl(tmp, cnt1);
3214     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3215     andl(cnt1,0x0000001F);  //tail count (in chars)
3216 
3217     bind(SCAN_TO_32_CHAR_LOOP);
3218     vmovdqu(vec3, Address(result, 0));
3219     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3220     vptest(vec2, vec3);
3221     jcc(Assembler::carryClear, FOUND_CHAR);
3222     addptr(result, 32);
3223     subl(tmp, stride*2);
3224     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3225     jmp(SCAN_TO_16_CHAR);
3226 
3227     bind(SCAN_TO_16_CHAR_INIT);
3228     movdl(vec1, ch);
3229     pxor(vec2, vec2);
3230     pshufb(vec1, vec2);
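    // With an all-zero shuffle control (vec2), pshufb copies byte 0 of vec1 into
    // every lane, i.e. it broadcasts the search byte across the vector.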
3231   }
3232 
3233   bind(SCAN_TO_16_CHAR);
3234   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3236   if (UseAVX < 2) {
3237     movdl(vec1, ch);
3238     pxor(vec2, vec2);
3239     pshufb(vec1, vec2);
3240   }
3241   movl(tmp, cnt1);
3242   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3243   andl(cnt1,0x0000000F);  //tail count (in bytes)
3244 
3245   bind(SCAN_TO_16_CHAR_LOOP);
3246   movdqu(vec3, Address(result, 0));
3247   pcmpeqb(vec3, vec1);
3248   ptest(vec2, vec3);
3249   jcc(Assembler::carryClear, FOUND_CHAR);
3250   addptr(result, 16);
3251   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3253 
3254   bind(SCAN_TO_CHAR_INIT);
3255   testl(cnt1, cnt1);
3256   jcc(Assembler::zero, RET_NOT_FOUND);
3257   bind(SCAN_TO_CHAR_LOOP);
3258   load_unsigned_byte(tmp, Address(result, 0));
3259   cmpl(ch, tmp);
3260   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3261   addptr(result, 1);
3262   subl(cnt1, 1);
3263   jccb(Assembler::zero, RET_NOT_FOUND);
3264   jmp(SCAN_TO_CHAR_LOOP);
3265 
3266   bind(RET_NOT_FOUND);
3267   movl(result, -1);
3268   jmpb(DONE_LABEL);
3269 
3270   bind(FOUND_CHAR);
3271   if (UseAVX >= 2) {
3272     vpmovmskb(tmp, vec3);
3273   } else {
3274     pmovmskb(tmp, vec3);
3275   }
3276   bsfl(ch, tmp);
3277   addptr(result, ch);
3278 
3279   bind(FOUND_SEQ_CHAR);
3280   subptr(result, str1);
3281 
3282   bind(DONE_LABEL);
3283 } // stringL_indexof_char
3284 
3285 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3286   switch (eltype) {
3287   case T_BOOLEAN: return sizeof(jboolean);
3288   case T_BYTE:  return sizeof(jbyte);
3289   case T_SHORT: return sizeof(jshort);
3290   case T_CHAR:  return sizeof(jchar);
3291   case T_INT:   return sizeof(jint);
3292   default:
3293     ShouldNotReachHere();
3294     return -1;
3295   }
3296 }
3297 
3298 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3299   switch (eltype) {
3300   // T_BOOLEAN used as surrogate for unsigned byte
3301   case T_BOOLEAN: movzbl(dst, src);   break;
3302   case T_BYTE:    movsbl(dst, src);   break;
3303   case T_SHORT:   movswl(dst, src);   break;
3304   case T_CHAR:    movzwl(dst, src);   break;
3305   case T_INT:     movl(dst, src);     break;
3306   default:
3307     ShouldNotReachHere();
3308   }
3309 }
3310 
3311 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3312   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3313 }
3314 
3315 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3316   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3317 }
3318 
3319 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3320   const int vlen = Assembler::AVX_256bit;
3321   switch (eltype) {
3322   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3323   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3324   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3325   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3326   case T_INT:
3327     // do nothing
3328     break;
3329   default:
3330     ShouldNotReachHere();
3331   }
3332 }
3333 
3334 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3335                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3336                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3337                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3338                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3339                                         BasicType eltype) {
3340   ShortBranchVerifier sbv(this);
3341   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3342   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3343   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3344 
3345   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3346         SHORT_UNROLLED_LOOP_EXIT,
3347         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3348         UNROLLED_VECTOR_LOOP_BEGIN,
3349         END;
3350   switch (eltype) {
3351   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3352   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3353   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3354   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3355   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3356   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3357   }
3358 
3359   // For "renaming" for readibility of the code
3360   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3361                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3362                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3363 
3364   const int elsize = arrays_hashcode_elsize(eltype);
3365 
3366   /*
3367     if (cnt1 >= 2) {
3368       if (cnt1 >= 32) {
3369         UNROLLED VECTOR LOOP
3370       }
3371       UNROLLED SCALAR LOOP
3372     }
3373     SINGLE SCALAR
3374    */
3375 
3376   cmpl(cnt1, 32);
3377   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3378 
3379   // cnt1 >= 32 && generate_vectorized_loop
3380   xorl(index, index);
3381 
3382   // vresult = IntVector.zero(I256);
3383   for (int idx = 0; idx < 4; idx++) {
3384     vpxor(vresult[idx], vresult[idx]);
3385   }
3386   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3387   Register bound = tmp2;
3388   Register next = tmp3;
3389   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3390   movl(next, Address(tmp2, 0));
3391   movdl(vnext, next);
3392   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3393 
3394   // index = 0;
3395   // bound = cnt1 & ~(32 - 1);
3396   movl(bound, cnt1);
3397   andl(bound, ~(32 - 1));
3398   // for (; index < bound; index += 32) {
3399   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3400   // result *= next;
3401   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; out-of-order (OOO)
  // execution can then hopefully do a better job of prefetching.
3404   for (int idx = 0; idx < 4; idx++) {
3405     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3406   }
3407   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3408   for (int idx = 0; idx < 4; idx++) {
3409     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3410     arrays_hashcode_elvcast(vtmp[idx], eltype);
3411     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3412   }
3413   // index += 32;
3414   addl(index, 32);
3415   // index < bound;
3416   cmpl(index, bound);
3417   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3418   // }
3419 
3420   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3421   subl(cnt1, bound);
3422   // release bound
3423 
3424   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3425   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3426   for (int idx = 0; idx < 4; idx++) {
3427     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT);
3428     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3429   }
3430   // result += vresult.reduceLanes(ADD);
3431   for (int idx = 0; idx < 4; idx++) {
3432     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3433   }
3434 
3435   // } else if (cnt1 < 32) {
3436 
3437   bind(SHORT_UNROLLED_BEGIN);
3438   // int i = 1;
3439   movl(index, 1);
3440   cmpl(index, cnt1);
3441   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3442 
3443   // for (; i < cnt1 ; i += 2) {
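  // Each iteration folds in two elements: result = result*31*31 + a[i-1]*31 + a[i];
  // 961 == 31*31, and the multiply by 31 is done as (x << 5) - x.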
3444   bind(SHORT_UNROLLED_LOOP_BEGIN);
3445   movl(tmp3, 961);
3446   imull(result, tmp3);
3447   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3448   movl(tmp3, tmp2);
3449   shll(tmp3, 5);
3450   subl(tmp3, tmp2);
3451   addl(result, tmp3);
3452   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3453   addl(result, tmp3);
3454   addl(index, 2);
3455   cmpl(index, cnt1);
3456   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3457 
3458   // }
3459   // if (i >= cnt1) {
3460   bind(SHORT_UNROLLED_LOOP_EXIT);
3461   jccb(Assembler::greater, END);
3462   movl(tmp2, result);
3463   shll(result, 5);
3464   subl(result, tmp2);
3465   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3466   addl(result, tmp3);
3467   // }
3468   bind(END);
3469 
3470   BLOCK_COMMENT("} // arrays_hashcode");
3471 
3472 } // arrays_hashcode
3473 
3474 // helper function for string_compare
3475 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3476                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3477                                            Address::ScaleFactor scale2, Register index, int ae) {
3478   if (ae == StrIntrinsicNode::LL) {
3479     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3480     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3481   } else if (ae == StrIntrinsicNode::UU) {
3482     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3483     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3484   } else {
3485     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3486     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3487   }
3488 }
3489 
3490 // Compare strings, used for char[] and byte[].
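// Roughly, the comparison implemented here is (illustrative Java-level sketch):
//   int lim = Math.min(cnt1, cnt2);
//   for (int k = 0; k < lim; k++) {
//     if (str1[k] != str2[k]) return str1[k] - str2[k];
//   }
//   return cnt1 - cnt2;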
3491 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3492                                        Register cnt1, Register cnt2, Register result,
3493                                        XMMRegister vec1, int ae, KRegister mask) {
3494   ShortBranchVerifier sbv(this);
3495   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3496   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3497   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3498   int stride2x2 = 0x40;
3499   Address::ScaleFactor scale = Address::no_scale;
3500   Address::ScaleFactor scale1 = Address::no_scale;
3501   Address::ScaleFactor scale2 = Address::no_scale;
3502 
3503   if (ae != StrIntrinsicNode::LL) {
3504     stride2x2 = 0x20;
3505   }
3506 
3507   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3508     shrl(cnt2, 1);
3509   }
3510   // Compute the minimum of the string lengths and the
3511   // difference of the string lengths (stack).
3512   // Do the conditional move stuff
3513   movl(result, cnt1);
3514   subl(cnt1, cnt2);
3515   push(cnt1);
3516   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3517 
3518   // Is the minimum length zero?
3519   testl(cnt2, cnt2);
3520   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3521   if (ae == StrIntrinsicNode::LL) {
3522     // Load first bytes
3523     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3524     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3525   } else if (ae == StrIntrinsicNode::UU) {
3526     // Load first characters
3527     load_unsigned_short(result, Address(str1, 0));
3528     load_unsigned_short(cnt1, Address(str2, 0));
3529   } else {
3530     load_unsigned_byte(result, Address(str1, 0));
3531     load_unsigned_short(cnt1, Address(str2, 0));
3532   }
3533   subl(result, cnt1);
3534   jcc(Assembler::notZero,  POP_LABEL);
3535 
3536   if (ae == StrIntrinsicNode::UU) {
3537     // Divide length by 2 to get number of chars
3538     shrl(cnt2, 1);
3539   }
3540   cmpl(cnt2, 1);
3541   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3542 
3543   // Check if the strings start at the same location and setup scale and stride
3544   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3545     cmpptr(str1, str2);
3546     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3547     if (ae == StrIntrinsicNode::LL) {
3548       scale = Address::times_1;
3549       stride = 16;
3550     } else {
3551       scale = Address::times_2;
3552       stride = 8;
3553     }
3554   } else {
3555     scale1 = Address::times_1;
3556     scale2 = Address::times_2;
3557     // scale not used
3558     stride = 8;
3559   }
3560 
3561   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3562     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3563     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3564     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3565     Label COMPARE_TAIL_LONG;
3566     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3567 
3568     int pcmpmask = 0x19;
3569     if (ae == StrIntrinsicNode::LL) {
3570       pcmpmask &= ~0x01;
3571     }
3572 
3573     // Setup to compare 16-chars (32-bytes) vectors,
3574     // start from first character again because it has aligned address.
3575     if (ae == StrIntrinsicNode::LL) {
3576       stride2 = 32;
3577     } else {
3578       stride2 = 16;
3579     }
3580     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3581       adr_stride = stride << scale;
3582     } else {
3583       adr_stride1 = 8;  //stride << scale1;
3584       adr_stride2 = 16; //stride << scale2;
3585     }
3586 
3587     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3589     movl(result, cnt2);
3590     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3591     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3592 
3593     // fast path : compare first 2 8-char vectors.
3594     bind(COMPARE_16_CHARS);
3595     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3596       movdqu(vec1, Address(str1, 0));
3597     } else {
3598       pmovzxbw(vec1, Address(str1, 0));
3599     }
3600     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3601     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3602 
3603     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3604       movdqu(vec1, Address(str1, adr_stride));
3605       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3606     } else {
3607       pmovzxbw(vec1, Address(str1, adr_stride1));
3608       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3609     }
3610     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3611     addl(cnt1, stride);
3612 
3613     // Compare the characters at index in cnt1
3614     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3615     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3616     subl(result, cnt2);
3617     jmp(POP_LABEL);
3618 
3619     // Setup the registers to start vector comparison loop
3620     bind(COMPARE_WIDE_VECTORS);
3621     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3622       lea(str1, Address(str1, result, scale));
3623       lea(str2, Address(str2, result, scale));
3624     } else {
3625       lea(str1, Address(str1, result, scale1));
3626       lea(str2, Address(str2, result, scale2));
3627     }
3628     subl(result, stride2);
3629     subl(cnt2, stride2);
3630     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3631     negptr(result);
3632 
3633     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3634     bind(COMPARE_WIDE_VECTORS_LOOP);
3635 
3636 #ifdef _LP64
3637     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3638       cmpl(cnt2, stride2x2);
3639       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3640       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3641       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3642 
3643       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3644       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3645         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3646         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3647       } else {
3648         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3649         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3650       }
3651       kortestql(mask, mask);
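      // kortestql sets CF only when the OR of the masks is all ones, so
      // aboveEqual (CF == 0) below means at least one byte compared unequal.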
3652       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3653       addptr(result, stride2x2);  // update since we already compared at this addr
3654       subl(cnt2, stride2x2);      // and sub the size too
3655       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3656 
3657       vpxor(vec1, vec1);
3658       jmpb(COMPARE_WIDE_TAIL);
3659     }//if (VM_Version::supports_avx512vlbw())
3660 #endif // _LP64
3661 
3662 
3663     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3664     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3665       vmovdqu(vec1, Address(str1, result, scale));
3666       vpxor(vec1, Address(str2, result, scale));
3667     } else {
3668       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3669       vpxor(vec1, Address(str2, result, scale2));
3670     }
3671     vptest(vec1, vec1);
3672     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3673     addptr(result, stride2);
3674     subl(cnt2, stride2);
3675     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3676     // clean upper bits of YMM registers
3677     vpxor(vec1, vec1);
3678 
3679     // compare wide vectors tail
3680     bind(COMPARE_WIDE_TAIL);
3681     testptr(result, result);
3682     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3683 
3684     movl(result, stride2);
3685     movl(cnt2, result);
3686     negptr(result);
3687     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3688 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3690     bind(VECTOR_NOT_EQUAL);
3691     // clean upper bits of YMM registers
3692     vpxor(vec1, vec1);
3693     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3694       lea(str1, Address(str1, result, scale));
3695       lea(str2, Address(str2, result, scale));
3696     } else {
3697       lea(str1, Address(str1, result, scale1));
3698       lea(str2, Address(str2, result, scale2));
3699     }
3700     jmp(COMPARE_16_CHARS);
3701 
3702     // Compare tail chars, length between 1 to 15 chars
3703     bind(COMPARE_TAIL_LONG);
3704     movl(cnt2, result);
3705     cmpl(cnt2, stride);
3706     jcc(Assembler::less, COMPARE_SMALL_STR);
3707 
3708     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3709       movdqu(vec1, Address(str1, 0));
3710     } else {
3711       pmovzxbw(vec1, Address(str1, 0));
3712     }
3713     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3714     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3715     subptr(cnt2, stride);
3716     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3717     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3718       lea(str1, Address(str1, result, scale));
3719       lea(str2, Address(str2, result, scale));
3720     } else {
3721       lea(str1, Address(str1, result, scale1));
3722       lea(str2, Address(str2, result, scale2));
3723     }
3724     negptr(cnt2);
3725     jmpb(WHILE_HEAD_LABEL);
3726 
3727     bind(COMPARE_SMALL_STR);
3728   } else if (UseSSE42Intrinsics) {
3729     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3730     int pcmpmask = 0x19;
3731     // Setup to compare 8-char (16-byte) vectors,
3732     // start from first character again because it has aligned address.
3733     movl(result, cnt2);
3734     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3735     if (ae == StrIntrinsicNode::LL) {
3736       pcmpmask &= ~0x01;
3737     }
3738     jcc(Assembler::zero, COMPARE_TAIL);
3739     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3740       lea(str1, Address(str1, result, scale));
3741       lea(str2, Address(str2, result, scale));
3742     } else {
3743       lea(str1, Address(str1, result, scale1));
3744       lea(str2, Address(str2, result, scale2));
3745     }
3746     negptr(result);
3747 
3748     // pcmpestri
3749     //   inputs:
3750     //     vec1- substring
3751     //     rax - negative string length (elements count)
3752     //     mem - scanned string
3753     //     rdx - string length (elements count)
3754     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3755     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3756     //   outputs:
3757     //     rcx - first mismatched element index
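    // For this imm8 (0x18/0x19): bits 1:0 element format (00 = unsigned bytes,
    //   01 = unsigned words), bits 3:2 = 10 (equal each), bits 5:4 = 01 (negate
    //   the result) and bit 6 = 0 (least significant differing index, in rcx).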
3758     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3759 
3760     bind(COMPARE_WIDE_VECTORS);
3761     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3762       movdqu(vec1, Address(str1, result, scale));
3763       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3764     } else {
3765       pmovzxbw(vec1, Address(str1, result, scale1));
3766       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3767     }
3768     // After pcmpestri cnt1(rcx) contains mismatched element index
3769 
3770     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3771     addptr(result, stride);
3772     subptr(cnt2, stride);
3773     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3774 
3775     // compare wide vectors tail
3776     testptr(result, result);
3777     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3778 
3779     movl(cnt2, stride);
3780     movl(result, stride);
3781     negptr(result);
3782     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3783       movdqu(vec1, Address(str1, result, scale));
3784       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3785     } else {
3786       pmovzxbw(vec1, Address(str1, result, scale1));
3787       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3788     }
3789     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3790 
3791     // Mismatched characters in the vectors
3792     bind(VECTOR_NOT_EQUAL);
3793     addptr(cnt1, result);
3794     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3795     subl(result, cnt2);
3796     jmpb(POP_LABEL);
3797 
3798     bind(COMPARE_TAIL); // limit is zero
3799     movl(cnt2, result);
3800     // Fallthru to tail compare
3801   }
3802   // Shift str2 and str1 to the end of the arrays, negate min
3803   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3804     lea(str1, Address(str1, cnt2, scale));
3805     lea(str2, Address(str2, cnt2, scale));
3806   } else {
3807     lea(str1, Address(str1, cnt2, scale1));
3808     lea(str2, Address(str2, cnt2, scale2));
3809   }
3810   decrementl(cnt2);  // first character was compared already
3811   negptr(cnt2);
3812 
3813   // Compare the rest of the elements
3814   bind(WHILE_HEAD_LABEL);
3815   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3816   subl(result, cnt1);
3817   jccb(Assembler::notZero, POP_LABEL);
3818   increment(cnt2);
3819   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3820 
3821   // Strings are equal up to min length.  Return the length difference.
3822   bind(LENGTH_DIFF_LABEL);
3823   pop(result);
3824   if (ae == StrIntrinsicNode::UU) {
3825     // Divide diff by 2 to get number of chars
3826     sarl(result, 1);
3827   }
3828   jmpb(DONE_LABEL);
3829 
3830 #ifdef _LP64
3831   if (VM_Version::supports_avx512vlbw()) {
3832 
3833     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3834 
3835     kmovql(cnt1, mask);
3836     notq(cnt1);
3837     bsfq(cnt2, cnt1);
3838     if (ae != StrIntrinsicNode::LL) {
3839       // Divide diff by 2 to get number of chars
3840       sarl(cnt2, 1);
3841     }
3842     addq(result, cnt2);
3843     if (ae == StrIntrinsicNode::LL) {
3844       load_unsigned_byte(cnt1, Address(str2, result));
3845       load_unsigned_byte(result, Address(str1, result));
3846     } else if (ae == StrIntrinsicNode::UU) {
3847       load_unsigned_short(cnt1, Address(str2, result, scale));
3848       load_unsigned_short(result, Address(str1, result, scale));
3849     } else {
3850       load_unsigned_short(cnt1, Address(str2, result, scale2));
3851       load_unsigned_byte(result, Address(str1, result, scale1));
3852     }
3853     subl(result, cnt1);
3854     jmpb(POP_LABEL);
3855   }//if (VM_Version::supports_avx512vlbw())
3856 #endif // _LP64
3857 
3858   // Discard the stored length difference
3859   bind(POP_LABEL);
3860   pop(cnt1);
3861 
3862   // That's it
3863   bind(DONE_LABEL);
3864   if(ae == StrIntrinsicNode::UL) {
3865     negl(result);
3866   }
3867 
3868 }
3869 
// Search for a non-ASCII character (negative byte value) in a byte array,
3871 // return the index of the first such character, otherwise the length
3872 // of the array segment searched.
3873 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3874 //   @IntrinsicCandidate
3875 //   public static int countPositives(byte[] ba, int off, int len) {
3876 //     for (int i = off; i < off + len; i++) {
3877 //       if (ba[i] < 0) {
3878 //         return i - off;
3879 //       }
3880 //     }
3881 //     return len;
3882 //   }
3883 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3884   Register result, Register tmp1,
3885   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3886   // rsi: byte array
3887   // rcx: len
3888   // rax: result
3889   ShortBranchVerifier sbv(this);
3890   assert_different_registers(ary1, len, result, tmp1);
3891   assert_different_registers(vec1, vec2);
3892   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3893 
3894   movl(result, len); // copy
3895   // len == 0
3896   testl(len, len);
3897   jcc(Assembler::zero, DONE);
3898 
3899   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3900     VM_Version::supports_avx512vlbw() &&
3901     VM_Version::supports_bmi2()) {
3902 
3903     Label test_64_loop, test_tail, BREAK_LOOP;
3904     movl(tmp1, len);
3905     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3906 
3907     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3908     andl(len,  0xffffffc0); // vector count (in chars)
3909     jccb(Assembler::zero, test_tail);
3910 
3911     lea(ary1, Address(ary1, len, Address::times_1));
3912     negptr(len);
3913 
3914     bind(test_64_loop);
    // Check whether our 64 byte-sized elements contain negatives
3916     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3917     kortestql(mask1, mask1);
3918     jcc(Assembler::notZero, BREAK_LOOP);
3919 
3920     addptr(len, 64);
3921     jccb(Assembler::notZero, test_64_loop);
3922 
3923     bind(test_tail);
3924     // bail out when there is nothing to be done
3925     testl(tmp1, -1);
3926     jcc(Assembler::zero, DONE);
3927 
3928 
    // check the tail for absence of negatives
3930     // ~(~0 << len) applied up to two times (for 32-bit scenario)
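    // e.g. a tail count of 5 in tmp1 yields the mask 0b11111, so only the 5 tail
    // bytes take part in the masked compare below.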
3931 #ifdef _LP64
3932     {
3933       Register tmp3_aliased = len;
3934       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3935       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3936       notq(tmp3_aliased);
3937       kmovql(mask2, tmp3_aliased);
3938     }
3939 #else
3940     Label k_init;
3941     jmp(k_init);
3942 
    // We cannot read 64 bits from a general-purpose register here, so we place the
    // data required to compose 64 ones into the instruction stream.
    // We emit a 64-byte-wide series of the elements 0..63, which is later used as
    // the compare target against the tail count held in the tmp1 register.
    // The result is a k register with tmp1 consecutive 1's, counting from the
    // least significant bit.
3949     address tmp = pc();
3950     emit_int64(0x0706050403020100);
3951     emit_int64(0x0F0E0D0C0B0A0908);
3952     emit_int64(0x1716151413121110);
3953     emit_int64(0x1F1E1D1C1B1A1918);
3954     emit_int64(0x2726252423222120);
3955     emit_int64(0x2F2E2D2C2B2A2928);
3956     emit_int64(0x3736353433323130);
3957     emit_int64(0x3F3E3D3C3B3A3938);
3958 
3959     bind(k_init);
3960     lea(len, InternalAddress(tmp));
3961     // create mask to test for negative byte inside a vector
3962     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3963     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3964 
3965 #endif
3966     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3967     ktestq(mask1, mask2);
3968     jcc(Assembler::zero, DONE);
3969 
    // do a full check for negative bytes in the tail
3971     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
3972                      // ary1 already pointing to the right place
3973     jmpb(TAIL_START);
3974 
3975     bind(BREAK_LOOP);
3976     // At least one byte in the last 64 byte block was negative.
3977     // Set up to look at the last 64 bytes as if they were a tail
3978     lea(ary1, Address(ary1, len, Address::times_1));
3979     addptr(result, len);
3980     // Ignore the very last byte: if all others are positive,
3981     // it must be negative, so we can skip right to the 2+1 byte
3982     // end comparison at this point
3983     orl(result, 63);
3984     movl(len, 63);
3985     // Fallthru to tail compare
3986   } else {
3987 
3988     if (UseAVX >= 2 && UseSSE >= 2) {
3989       // With AVX2, use 32-byte vector compare
3990       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3991 
3992       // Compare 32-byte vectors
3993       testl(len, 0xffffffe0);   // vector count (in bytes)
3994       jccb(Assembler::zero, TAIL_START);
3995 
3996       andl(len, 0xffffffe0);
3997       lea(ary1, Address(ary1, len, Address::times_1));
3998       negptr(len);
3999 
      movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
4001       movdl(vec2, tmp1);
4002       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
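      // vec2 now has only the sign bit of every byte set; vptest(data, vec2)
      // leaves ZF clear exactly when some byte in the 32-byte chunk is negative.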
4003 
4004       bind(COMPARE_WIDE_VECTORS);
4005       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4006       vptest(vec1, vec2);
4007       jccb(Assembler::notZero, BREAK_LOOP);
4008       addptr(len, 32);
4009       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4010 
4011       testl(result, 0x0000001f);   // any bytes remaining?
4012       jcc(Assembler::zero, DONE);
4013 
4014       // Quick test using the already prepared vector mask
4015       movl(len, result);
4016       andl(len, 0x0000001f);
4017       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4018       vptest(vec1, vec2);
4019       jcc(Assembler::zero, DONE);
4020       // There are zeros, jump to the tail to determine exactly where
4021       jmpb(TAIL_START);
4022 
4023       bind(BREAK_LOOP);
4024       // At least one byte in the last 32-byte vector is negative.
4025       // Set up to look at the last 32 bytes as if they were a tail
4026       lea(ary1, Address(ary1, len, Address::times_1));
4027       addptr(result, len);
4028       // Ignore the very last byte: if all others are positive,
4029       // it must be negative, so we can skip right to the 2+1 byte
4030       // end comparison at this point
4031       orl(result, 31);
4032       movl(len, 31);
4033       // Fallthru to tail compare
4034     } else if (UseSSE42Intrinsics) {
4035       // With SSE4.2, use double quad vector compare
4036       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4037 
4038       // Compare 16-byte vectors
4039       testl(len, 0xfffffff0);   // vector count (in bytes)
4040       jcc(Assembler::zero, TAIL_START);
4041 
4042       andl(len, 0xfffffff0);
4043       lea(ary1, Address(ary1, len, Address::times_1));
4044       negptr(len);
4045 
4046       movl(tmp1, 0x80808080);
4047       movdl(vec2, tmp1);
4048       pshufd(vec2, vec2, 0);
4049 
4050       bind(COMPARE_WIDE_VECTORS);
4051       movdqu(vec1, Address(ary1, len, Address::times_1));
4052       ptest(vec1, vec2);
4053       jccb(Assembler::notZero, BREAK_LOOP);
4054       addptr(len, 16);
4055       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4056 
4057       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4058       jcc(Assembler::zero, DONE);
4059 
4060       // Quick test using the already prepared vector mask
4061       movl(len, result);
4062       andl(len, 0x0000000f);   // tail count (in bytes)
4063       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4064       ptest(vec1, vec2);
4065       jcc(Assembler::zero, DONE);
4066       jmpb(TAIL_START);
4067 
4068       bind(BREAK_LOOP);
4069       // At least one byte in the last 16-byte vector is negative.
4070       // Set up and look at the last 16 bytes as if they were a tail
4071       lea(ary1, Address(ary1, len, Address::times_1));
4072       addptr(result, len);
4073       // Ignore the very last byte: if all others are positive,
4074       // it must be negative, so we can skip right to the 2+1 byte
4075       // end comparison at this point
4076       orl(result, 15);
4077       movl(len, 15);
4078       // Fallthru to tail compare
4079     }
4080   }
4081 
4082   bind(TAIL_START);
4083   // Compare 4-byte vectors
4084   andl(len, 0xfffffffc); // vector count (in bytes)
4085   jccb(Assembler::zero, COMPARE_CHAR);
4086 
4087   lea(ary1, Address(ary1, len, Address::times_1));
4088   negptr(len);
4089 
4090   bind(COMPARE_VECTORS);
4091   movl(tmp1, Address(ary1, len, Address::times_1));
4092   andl(tmp1, 0x80808080);
4093   jccb(Assembler::notZero, TAIL_ADJUST);
4094   addptr(len, 4);
4095   jccb(Assembler::notZero, COMPARE_VECTORS);
4096 
4097   // Compare trailing char (final 2-3 bytes), if any
4098   bind(COMPARE_CHAR);
4099 
4100   testl(result, 0x2);   // tail  char
4101   jccb(Assembler::zero, COMPARE_BYTE);
4102   load_unsigned_short(tmp1, Address(ary1, 0));
4103   andl(tmp1, 0x00008080);
4104   jccb(Assembler::notZero, CHAR_ADJUST);
4105   lea(ary1, Address(ary1, 2));
4106 
4107   bind(COMPARE_BYTE);
4108   testl(result, 0x1);   // tail  byte
4109   jccb(Assembler::zero, DONE);
4110   load_unsigned_byte(tmp1, Address(ary1, 0));
4111   testl(tmp1, 0x00000080);
4112   jccb(Assembler::zero, DONE);
4113   subptr(result, 1);
4114   jmpb(DONE);
4115 
4116   bind(TAIL_ADJUST);
4117   // there are negative bits in the last 4 byte block.
4118   // Adjust result and check the next three bytes
4119   addptr(result, len);
4120   orl(result, 3);
4121   lea(ary1, Address(ary1, len, Address::times_1));
4122   jmpb(COMPARE_CHAR);
4123 
4124   bind(CHAR_ADJUST);
4125   // We are looking at a char + optional byte tail, and found that one
4126   // of the bytes in the char is negative. Adjust the result, check the
4127   // first byte and readjust if needed.
4128   andl(result, 0xfffffffc);
4129   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4130   jccb(Assembler::notZero, DONE);
4131   addptr(result, 1);
4132 
4133   // That's it
4134   bind(DONE);
4135   if (UseAVX >= 2 && UseSSE >= 2) {
4136     // clean upper bits of YMM registers
4137     vpxor(vec1, vec1);
4138     vpxor(vec2, vec2);
4139   }
4140 }
4141 
4142 // Compare char[] or byte[] arrays, or substrings of them, aligned to 4 bytes.
4143 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4144                                       Register limit, Register result, Register chr,
4145                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4146   ShortBranchVerifier sbv(this);
4147   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4148 
4149   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4150   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4151 
4152   if (is_array_equ) {
4153     // Check the input args
4154     cmpoop(ary1, ary2);
4155     jcc(Assembler::equal, TRUE_LABEL);
4156 
4157     // Need additional checks for arrays_equals.
4158     testptr(ary1, ary1);
4159     jcc(Assembler::zero, FALSE_LABEL);
4160     testptr(ary2, ary2);
4161     jcc(Assembler::zero, FALSE_LABEL);
4162 
4163     // Check the lengths
4164     movl(limit, Address(ary1, length_offset));
4165     cmpl(limit, Address(ary2, length_offset));
4166     jcc(Assembler::notEqual, FALSE_LABEL);
4167   }
4168 
4169   // count == 0
4170   testl(limit, limit);
4171   jcc(Assembler::zero, TRUE_LABEL);
4172 
4173   if (is_array_equ) {
4174     // Load array address
4175     lea(ary1, Address(ary1, base_offset));
4176     lea(ary2, Address(ary2, base_offset));
4177   }
4178 
4179   if (is_array_equ && is_char) {
4180     // arrays_equals when used for char[].
4181     shll(limit, 1);      // byte count != 0
4182   }
4183   movl(result, limit); // copy
4184 
4185   if (UseAVX >= 2) {
4186     // With AVX2, use 32-byte vector compare
4187     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4188 
4189     // Compare 32-byte vectors
4190     andl(result, 0x0000001f);  //   tail count (in bytes)
4191     andl(limit, 0xffffffe0);   // vector count (in bytes)
4192     jcc(Assembler::zero, COMPARE_TAIL);
4193 
4194     lea(ary1, Address(ary1, limit, Address::times_1));
4195     lea(ary2, Address(ary2, limit, Address::times_1));
4196     negptr(limit);
4197 
4198 #ifdef _LP64
4199     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4200       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4201 
4202       cmpl(limit, -64);
4203       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4204 
4205       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4206 
4207       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4208       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4209       kortestql(mask, mask);
4210       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4211       addptr(limit, 64);  // update since we already compared at this addr
4212       cmpl(limit, -64);
4213       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4214 
4215       // At this point we may still need to compare -limit+result bytes.
4216       // We could execute the next two instructions and just continue via the non-wide path:
4217       //  cmpl(limit, 0);
4218       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4219       // But since we stopped at the points ary{1,2}+limit which are
4220       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4221       // (|limit| <= 32 and result < 32),
4222       // we may just compare the last 64 bytes.
4223       //
4224       addptr(result, -64);   // it is safe, because we just came from this area
4225       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4226       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4227       kortestql(mask, mask);
4228       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4229 
4230       jmp(TRUE_LABEL);
4231 
4232       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4233 
4234     }//if (VM_Version::supports_avx512vlbw())
4235 #endif //_LP64
4236     bind(COMPARE_WIDE_VECTORS);
4237     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4238     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4239     vpxor(vec1, vec2);
4240 
4241     vptest(vec1, vec1);
4242     jcc(Assembler::notZero, FALSE_LABEL);
4243     addptr(limit, 32);
4244     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4245 
4246     testl(result, result);
4247     jcc(Assembler::zero, TRUE_LABEL);
4248 
4249     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4250     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4251     vpxor(vec1, vec2);
4252 
4253     vptest(vec1, vec1);
4254     jccb(Assembler::notZero, FALSE_LABEL);
4255     jmpb(TRUE_LABEL);
4256 
4257     bind(COMPARE_TAIL); // limit is zero
4258     movl(limit, result);
4259     // Fallthru to tail compare
4260   } else if (UseSSE42Intrinsics) {
4261     // With SSE4.2, use double quad vector compare
4262     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4263 
4264     // Compare 16-byte vectors
4265     andl(result, 0x0000000f);  //   tail count (in bytes)
4266     andl(limit, 0xfffffff0);   // vector count (in bytes)
4267     jcc(Assembler::zero, COMPARE_TAIL);
4268 
4269     lea(ary1, Address(ary1, limit, Address::times_1));
4270     lea(ary2, Address(ary2, limit, Address::times_1));
4271     negptr(limit);
4272 
4273     bind(COMPARE_WIDE_VECTORS);
4274     movdqu(vec1, Address(ary1, limit, Address::times_1));
4275     movdqu(vec2, Address(ary2, limit, Address::times_1));
4276     pxor(vec1, vec2);
4277 
4278     ptest(vec1, vec1);
4279     jcc(Assembler::notZero, FALSE_LABEL);
4280     addptr(limit, 16);
4281     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4282 
4283     testl(result, result);
4284     jcc(Assembler::zero, TRUE_LABEL);
4285 
4286     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4287     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4288     pxor(vec1, vec2);
4289 
4290     ptest(vec1, vec1);
4291     jccb(Assembler::notZero, FALSE_LABEL);
4292     jmpb(TRUE_LABEL);
4293 
4294     bind(COMPARE_TAIL); // limit is zero
4295     movl(limit, result);
4296     // Fallthru to tail compare
4297   }
4298 
4299   // Compare 4-byte vectors
4300   andl(limit, 0xfffffffc); // vector count (in bytes)
4301   jccb(Assembler::zero, COMPARE_CHAR);
4302 
4303   lea(ary1, Address(ary1, limit, Address::times_1));
4304   lea(ary2, Address(ary2, limit, Address::times_1));
4305   negptr(limit);
4306 
4307   bind(COMPARE_VECTORS);
4308   movl(chr, Address(ary1, limit, Address::times_1));
4309   cmpl(chr, Address(ary2, limit, Address::times_1));
4310   jccb(Assembler::notEqual, FALSE_LABEL);
4311   addptr(limit, 4);
4312   jcc(Assembler::notZero, COMPARE_VECTORS);
4313 
4314   // Compare trailing char (final 2 bytes), if any
4315   bind(COMPARE_CHAR);
4316   testl(result, 0x2);   // tail  char
4317   jccb(Assembler::zero, COMPARE_BYTE);
4318   load_unsigned_short(chr, Address(ary1, 0));
4319   load_unsigned_short(limit, Address(ary2, 0));
4320   cmpl(chr, limit);
4321   jccb(Assembler::notEqual, FALSE_LABEL);
4322 
4323   if (is_array_equ && is_char) {
4324     bind(COMPARE_BYTE);
4325   } else {
4326     lea(ary1, Address(ary1, 2));
4327     lea(ary2, Address(ary2, 2));
4328 
4329     bind(COMPARE_BYTE);
4330     testl(result, 0x1);   // tail  byte
4331     jccb(Assembler::zero, TRUE_LABEL);
4332     load_unsigned_byte(chr, Address(ary1, 0));
4333     load_unsigned_byte(limit, Address(ary2, 0));
4334     cmpl(chr, limit);
4335     jccb(Assembler::notEqual, FALSE_LABEL);
4336   }
4337   bind(TRUE_LABEL);
4338   movl(result, 1);   // return true
4339   jmpb(DONE);
4340 
4341   bind(FALSE_LABEL);
4342   xorl(result, result); // return false
4343 
4344   // That's it
4345   bind(DONE);
4346   if (UseAVX >= 2) {
4347     // clean upper bits of YMM registers
4348     vpxor(vec1, vec1);
4349     vpxor(vec2, vec2);
4350   }
4351 }
4352 
4353 #ifdef _LP64
4354 
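     // Out-of-line slow path for convertF2I below: spill the source value to the
     // stack, call the matching fixup stub (f2i/f2l/d2i/d2l) and pop the corrected
     // result into dst before jumping back to the stub's continuation.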
4355 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4356 #define __ masm.
4357   Register dst = stub.data<0>();
4358   XMMRegister src = stub.data<1>();
4359   address target = stub.data<2>();
4360   __ bind(stub.entry());
4361   __ subptr(rsp, 8);
4362   __ movdbl(Address(rsp), src);
4363   __ call(RuntimeAddress(target));
4364   __ pop(dst);
4365   __ jmp(stub.continuation());
4366 #undef __
4367 }
4368 
4369 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4370   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4371   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4372 
4373   address slowpath_target;
4374   if (dst_bt == T_INT) {
4375     if (src_bt == T_FLOAT) {
4376       cvttss2sil(dst, src);
4377       cmpl(dst, 0x80000000);
4378       slowpath_target = StubRoutines::x86::f2i_fixup();
4379     } else {
4380       cvttsd2sil(dst, src);
4381       cmpl(dst, 0x80000000);
4382       slowpath_target = StubRoutines::x86::d2i_fixup();
4383     }
4384   } else {
4385     if (src_bt == T_FLOAT) {
4386       cvttss2siq(dst, src);
4387       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4388       slowpath_target = StubRoutines::x86::f2l_fixup();
4389     } else {
4390       cvttsd2siq(dst, src);
4391       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4392       slowpath_target = StubRoutines::x86::d2l_fixup();
4393     }
4394   }
4395 
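       // Note: the cvttss2si/cvttsd2si family returns the x86 "integer indefinite"
       // value (0x80000000 for int, 0x8000000000000000 for long) for NaN and
       // out-of-range inputs, so only a result equal to that value is routed to the
       // out-of-line fixup stub below.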
4396   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4397   jcc(Assembler::equal, stub->entry());
4398   bind(stub->continuation());
4399 }
4400 
4401 #endif // _LP64
4402 
4403 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4404                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4405   switch(ideal_opc) {
4406     case Op_LShiftVS:
4407       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4408     case Op_LShiftVI:
4409       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4410     case Op_LShiftVL:
4411       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4412     case Op_RShiftVS:
4413       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4414     case Op_RShiftVI:
4415       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4416     case Op_RShiftVL:
4417       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4418     case Op_URShiftVS:
4419       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4420     case Op_URShiftVI:
4421       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4422     case Op_URShiftVL:
4423       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4424     case Op_RotateRightV:
4425       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4426     case Op_RotateLeftV:
4427       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4428     default:
4429       fatal("Unsupported masked operation"); break;
4430   }
4431 }
4432 
4433 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4434                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4435                                     bool is_varshift) {
4436   switch (ideal_opc) {
4437     case Op_AddVB:
4438       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4439     case Op_AddVS:
4440       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4441     case Op_AddVI:
4442       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4443     case Op_AddVL:
4444       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4445     case Op_AddVF:
4446       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4447     case Op_AddVD:
4448       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4449     case Op_SubVB:
4450       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4451     case Op_SubVS:
4452       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4453     case Op_SubVI:
4454       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4455     case Op_SubVL:
4456       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4457     case Op_SubVF:
4458       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4459     case Op_SubVD:
4460       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4461     case Op_MulVS:
4462       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4463     case Op_MulVI:
4464       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4465     case Op_MulVL:
4466       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4467     case Op_MulVF:
4468       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4469     case Op_MulVD:
4470       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4471     case Op_DivVF:
4472       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4473     case Op_DivVD:
4474       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4475     case Op_SqrtVF:
4476       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4477     case Op_SqrtVD:
4478       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4479     case Op_AbsVB:
4480       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4481     case Op_AbsVS:
4482       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4483     case Op_AbsVI:
4484       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4485     case Op_AbsVL:
4486       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4487     case Op_FmaVF:
4488       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4489     case Op_FmaVD:
4490       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4491     case Op_VectorRearrange:
4492       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4493     case Op_LShiftVS:
4494       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4495     case Op_LShiftVI:
4496       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4497     case Op_LShiftVL:
4498       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4499     case Op_RShiftVS:
4500       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4501     case Op_RShiftVI:
4502       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4503     case Op_RShiftVL:
4504       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4505     case Op_URShiftVS:
4506       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4507     case Op_URShiftVI:
4508       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4509     case Op_URShiftVL:
4510       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4511     case Op_RotateLeftV:
4512       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4513     case Op_RotateRightV:
4514       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4515     case Op_MaxV:
4516       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4517     case Op_MinV:
4518       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4519     case Op_XorV:
4520       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4521     case Op_OrV:
4522       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4523     case Op_AndV:
4524       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4525     default:
4526       fatal("Unsupported masked operation"); break;
4527   }
4528 }
4529 
4530 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4531                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4532   switch (ideal_opc) {
4533     case Op_AddVB:
4534       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4535     case Op_AddVS:
4536       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4537     case Op_AddVI:
4538       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4539     case Op_AddVL:
4540       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4541     case Op_AddVF:
4542       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4543     case Op_AddVD:
4544       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4545     case Op_SubVB:
4546       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4547     case Op_SubVS:
4548       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4549     case Op_SubVI:
4550       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4551     case Op_SubVL:
4552       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4553     case Op_SubVF:
4554       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4555     case Op_SubVD:
4556       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4557     case Op_MulVS:
4558       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4559     case Op_MulVI:
4560       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4561     case Op_MulVL:
4562       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4563     case Op_MulVF:
4564       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4565     case Op_MulVD:
4566       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4567     case Op_DivVF:
4568       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4569     case Op_DivVD:
4570       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4571     case Op_FmaVF:
4572       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4573     case Op_FmaVD:
4574       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4575     case Op_MaxV:
4576       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4577     case Op_MinV:
4578       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4579     case Op_XorV:
4580       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4581     case Op_OrV:
4582       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4583     case Op_AndV:
4584       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4585     default:
4586       fatal("Unsupported masked operation"); break;
4587   }
4588 }
4589 
4590 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4591                                   KRegister src1, KRegister src2) {
4592   BasicType etype = T_ILLEGAL;
4593   switch(mask_len) {
4594     case 2:
4595     case 4:
4596     case 8:  etype = T_BYTE; break;
4597     case 16: etype = T_SHORT; break;
4598     case 32: etype = T_INT; break;
4599     case 64: etype = T_LONG; break;
4600     default: fatal("Unsupported type"); break;
4601   }
4602   assert(etype != T_ILLEGAL, "");
4603   switch(ideal_opc) {
4604     case Op_AndVMask:
4605       kand(etype, dst, src1, src2); break;
4606     case Op_OrVMask:
4607       kor(etype, dst, src1, src2); break;
4608     case Op_XorVMask:
4609       kxor(etype, dst, src1, src2); break;
4610     default:
4611       fatal("Unsupported masked operation"); break;
4612   }
4613 }
4614 
4615 /*
4616  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4617  * If src is NaN, the result is 0.
4618  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4619  * the result is equal to the value of Integer.MIN_VALUE.
4620  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4621  * the result is equal to the value of Integer.MAX_VALUE.
4622  */
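     // Illustrative (not emitted) Java-level equivalents of the rules above, assuming
     // the standard Java narrowing-conversion semantics:
     //   (int) Float.NaN               == 0
     //   (int) Float.NEGATIVE_INFINITY == Integer.MIN_VALUE
     //   (int) Float.POSITIVE_INFINITY == Integer.MAX_VALUE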
4623 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4624                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4625                                                                    Register rscratch, AddressLiteral float_sign_flip,
4626                                                                    int vec_enc) {
4627   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4628   Label done;
4629   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4630   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4631   vptest(xtmp2, xtmp2, vec_enc);
4632   jccb(Assembler::equal, done);
4633 
4634   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4635   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4636 
4637   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4638   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4639   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4640 
4641   // Recompute the mask for the remaining special values.
4642   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4643   // Extract SRC values corresponding to TRUE mask lanes.
4644   vpand(xtmp4, xtmp2, src, vec_enc);
4645   // Flip mask bits so that the MSB of mask lanes corresponding to +ve special
4646   // values is set.
4647   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4648 
4649   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4650   bind(done);
4651 }
4652 
4653 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4654                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4655                                                                     Register rscratch, AddressLiteral float_sign_flip,
4656                                                                     int vec_enc) {
4657   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4658   Label done;
4659   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4660   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4661   kortestwl(ktmp1, ktmp1);
4662   jccb(Assembler::equal, done);
4663 
4664   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4665   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4666   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4667 
4668   kxorwl(ktmp1, ktmp1, ktmp2);
4669   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4670   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4671   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4672   bind(done);
4673 }
4674 
4675 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4676                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4677                                                                      Register rscratch, AddressLiteral double_sign_flip,
4678                                                                      int vec_enc) {
4679   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4680 
4681   Label done;
4682   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4683   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4684   kortestwl(ktmp1, ktmp1);
4685   jccb(Assembler::equal, done);
4686 
4687   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4688   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4689   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4690 
4691   kxorwl(ktmp1, ktmp1, ktmp2);
4692   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4693   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4694   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4695   bind(done);
4696 }
4697 
4698 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4699                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4700                                                                      Register rscratch, AddressLiteral float_sign_flip,
4701                                                                      int vec_enc) {
4702   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4703   Label done;
4704   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4705   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4706   kortestwl(ktmp1, ktmp1);
4707   jccb(Assembler::equal, done);
4708 
4709   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4710   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4711   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4712 
4713   kxorwl(ktmp1, ktmp1, ktmp2);
4714   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4715   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4716   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4717   bind(done);
4718 }
4719 
4720 /*
4721  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4722  * If src is NaN, the result is 0.
4723  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4724  * the result is equal to the value of Long.MIN_VALUE.
4725  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4726  * the result is equal to the value of Long.MAX_VALUE.
4727  */
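     // Illustrative (not emitted) Java-level equivalents of the rules above:
     //   (long) Double.NaN               == 0L
     //   (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE
     //   (long) Double.POSITIVE_INFINITY == Long.MAX_VALUE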
4728 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4729                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4730                                                                       Register rscratch, AddressLiteral double_sign_flip,
4731                                                                       int vec_enc) {
4732   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4733 
4734   Label done;
4735   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4736   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4737   kortestwl(ktmp1, ktmp1);
4738   jccb(Assembler::equal, done);
4739 
4740   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4741   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4742   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4743 
4744   kxorwl(ktmp1, ktmp1, ktmp2);
4745   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4746   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4747   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4748   bind(done);
4749 }
4750 
4751 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4752                                                              XMMRegister xtmp, int index, int vec_enc) {
4753    assert(vec_enc < Assembler::AVX_512bit, "");
4754    if (vec_enc == Assembler::AVX_256bit) {
4755      vextractf128_high(xtmp, src);
4756      vshufps(dst, src, xtmp, index, vec_enc);
4757    } else {
4758      vshufps(dst, src, zero, index, vec_enc);
4759    }
4760 }
4761 
4762 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4763                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4764                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4765   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4766 
4767   Label done;
4768   // Compare the destination lanes with float_sign_flip
4769   // value to get mask for all special values.
4770   movdqu(xtmp1, float_sign_flip, rscratch);
4771   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4772   ptest(xtmp2, xtmp2);
4773   jccb(Assembler::equal, done);
4774 
4775   // Flip float_sign_flip to get max integer value.
4776   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4777   pxor(xtmp1, xtmp4);
4778 
4779   // Set destination lanes corresponding to unordered source lanes to zero.
4780   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4781   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4782 
4783   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4784   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4785   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4786 
4787   // Recompute the mask for remaining special value.
4788   pxor(xtmp2, xtmp3);
4789   // Extract mask corresponding to non-negative source lanes.
4790   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4791 
4792   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4793   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4794   pand(xtmp3, xtmp2);
4795 
4796   // Replace destination lanes holding the special value (0x80000000) with max int
4797   // if the corresponding source lane holds a +ve value.
4798   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4799   bind(done);
4800 }
4801 
4802 
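     // Narrows packed ints to shorts or bytes: each int lane is first masked down to
     // the subword value and then packed with unsigned saturation (vpackusdw, plus
     // vpackuswb for bytes). Since the packs work within 128-bit lanes, the
     // AVX_256bit case also uses the cross-lane doubleword pack helper above to
     // gather the per-lane results.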
4803 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4804                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4805   switch(to_elem_bt) {
4806     case T_SHORT:
4807       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4808       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4809       vpackusdw(dst, dst, zero, vec_enc);
4810       if (vec_enc == Assembler::AVX_256bit) {
4811         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4812       }
4813       break;
4814     case  T_BYTE:
4815       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4816       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4817       vpackusdw(dst, dst, zero, vec_enc);
4818       if (vec_enc == Assembler::AVX_256bit) {
4819         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4820       }
4821       vpackuswb(dst, dst, zero, vec_enc);
4822       break;
4823     default: assert(false, "%s", type2name(to_elem_bt));
4824   }
4825 }
4826 
4827 /*
4828  * Algorithm for vector D2L and F2I conversions:
4829  * a) Perform vector D2L/F2I cast.
4830  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
4831  *    A lane holding 0x80000000 signifies that the source value could be one of the special
4832  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4833  * c) Set the destination lane to zero if the source lane is NaN.
4834  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4835  */
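     // Rough per-lane sketch of steps b)-d) above (illustrative pseudocode only):
     //   if (dst_lane == 0x80000000) {      // lane came from a special source value
     //     if (isNaN(src_lane))       dst_lane = 0;
     //     else if (src_lane > 0)     dst_lane = Integer.MAX_VALUE;
     //     // otherwise keep 0x80000000 == Integer.MIN_VALUE
     //   }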
4836 
4837 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4838                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4839                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4840   int to_elem_sz = type2aelembytes(to_elem_bt);
4841   assert(to_elem_sz <= 4, "");
4842   vcvttps2dq(dst, src, vec_enc);
4843   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4844   if (to_elem_sz < 4) {
4845     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4846     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4847   }
4848 }
4849 
4850 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4851                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4852                                             Register rscratch, int vec_enc) {
4853   int to_elem_sz = type2aelembytes(to_elem_bt);
4854   assert(to_elem_sz <= 4, "");
4855   vcvttps2dq(dst, src, vec_enc);
4856   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4857   switch(to_elem_bt) {
4858     case T_INT:
4859       break;
4860     case T_SHORT:
4861       evpmovdw(dst, dst, vec_enc);
4862       break;
4863     case T_BYTE:
4864       evpmovdb(dst, dst, vec_enc);
4865       break;
4866     default: assert(false, "%s", type2name(to_elem_bt));
4867   }
4868 }
4869 
4870 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4871                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4872                                             Register rscratch, int vec_enc) {
4873   evcvttps2qq(dst, src, vec_enc);
4874   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4875 }
4876 
4877 // Handling for downcasting from double to integer or sub-word types on AVX2.
4878 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4879                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4880                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4881   int to_elem_sz = type2aelembytes(to_elem_bt);
4882   assert(to_elem_sz < 8, "");
4883   vcvttpd2dq(dst, src, vec_enc);
4884   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4885                                               float_sign_flip, vec_enc);
4886   if (to_elem_sz < 4) {
4887     // xtmp4 holds all zero lanes.
4888     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4889   }
4890 }
4891 
4892 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4893                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4894                                             KRegister ktmp2, AddressLiteral sign_flip,
4895                                             Register rscratch, int vec_enc) {
4896   if (VM_Version::supports_avx512dq()) {
4897     evcvttpd2qq(dst, src, vec_enc);
4898     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4899     switch(to_elem_bt) {
4900       case T_LONG:
4901         break;
4902       case T_INT:
4903         evpmovsqd(dst, dst, vec_enc);
4904         break;
4905       case T_SHORT:
4906         evpmovsqd(dst, dst, vec_enc);
4907         evpmovdw(dst, dst, vec_enc);
4908         break;
4909       case T_BYTE:
4910         evpmovsqd(dst, dst, vec_enc);
4911         evpmovdb(dst, dst, vec_enc);
4912         break;
4913       default: assert(false, "%s", type2name(to_elem_bt));
4914     }
4915   } else {
4916     assert(type2aelembytes(to_elem_bt) <= 4, "");
4917     vcvttpd2dq(dst, src, vec_enc);
4918     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4919     switch(to_elem_bt) {
4920       case T_INT:
4921         break;
4922       case T_SHORT:
4923         evpmovdw(dst, dst, vec_enc);
4924         break;
4925       case T_BYTE:
4926         evpmovdb(dst, dst, vec_enc);
4927         break;
4928       default: assert(false, "%s", type2name(to_elem_bt));
4929     }
4930   }
4931 }
4932 
4933 #ifdef _LP64
4934 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4935                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4936                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4937   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4938   // and restore the original MXCSR.RC mode after that.
4939   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4940 
4941   mov64(tmp, julong_cast(0.5L));
4942   evpbroadcastq(xtmp1, tmp, vec_enc);
4943   vaddpd(xtmp1, src , xtmp1, vec_enc);
4944   evcvtpd2qq(dst, xtmp1, vec_enc);
4945   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4946                                                 double_sign_flip, vec_enc);
4947 
4948   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4949 }
4950 
4951 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4952                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4953                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4954   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4955   // and restore the original MXCSR.RC mode after that.
4956   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4957 
4958   movl(tmp, jint_cast(0.5));
4959   movq(xtmp1, tmp);
4960   vbroadcastss(xtmp1, xtmp1, vec_enc);
4961   vaddps(xtmp1, src , xtmp1, vec_enc);
4962   vcvtps2dq(dst, xtmp1, vec_enc);
4963   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4964                                               float_sign_flip, vec_enc);
4965 
4966   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4967 }
4968 
4969 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4970                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4971                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4972   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4973   // and restore the original MXCSR.RC mode after that.
4974   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4975 
4976   movl(tmp, jint_cast(0.5));
4977   movq(xtmp1, tmp);
4978   vbroadcastss(xtmp1, xtmp1, vec_enc);
4979   vaddps(xtmp1, src , xtmp1, vec_enc);
4980   vcvtps2dq(dst, xtmp1, vec_enc);
4981   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4982 
4983   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4984 }
4985 #endif // _LP64
4986 
4987 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4988                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4989   switch (from_elem_bt) {
4990     case T_BYTE:
4991       switch (to_elem_bt) {
4992         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4993         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4994         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4995         default: ShouldNotReachHere();
4996       }
4997       break;
4998     case T_SHORT:
4999       switch (to_elem_bt) {
5000         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5001         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5002         default: ShouldNotReachHere();
5003       }
5004       break;
5005     case T_INT:
5006       assert(to_elem_bt == T_LONG, "");
5007       vpmovzxdq(dst, src, vlen_enc);
5008       break;
5009     default:
5010       ShouldNotReachHere();
5011   }
5012 }
5013 
5014 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5015                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5016   switch (from_elem_bt) {
5017     case T_BYTE:
5018       switch (to_elem_bt) {
5019         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5020         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5021         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5022         default: ShouldNotReachHere();
5023       }
5024       break;
5025     case T_SHORT:
5026       switch (to_elem_bt) {
5027         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5028         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5029         default: ShouldNotReachHere();
5030       }
5031       break;
5032     case T_INT:
5033       assert(to_elem_bt == T_LONG, "");
5034       vpmovsxdq(dst, src, vlen_enc);
5035       break;
5036     default:
5037       ShouldNotReachHere();
5038   }
5039 }
5040 
5041 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5042                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5043   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5044   assert(vlen_enc != AVX_512bit, "");
5045 
5046   int dst_bt_size = type2aelembytes(dst_bt);
5047   int src_bt_size = type2aelembytes(src_bt);
5048   if (dst_bt_size > src_bt_size) {
5049     switch (dst_bt_size / src_bt_size) {
5050       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5051       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5052       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5053       default: ShouldNotReachHere();
5054     }
5055   } else {
5056     assert(dst_bt_size < src_bt_size, "");
5057     switch (src_bt_size / dst_bt_size) {
5058       case 2: {
5059         if (vlen_enc == AVX_128bit) {
5060           vpacksswb(dst, src, src, vlen_enc);
5061         } else {
5062           vpacksswb(dst, src, src, vlen_enc);
5063           vpermq(dst, dst, 0x08, vlen_enc);
5064         }
5065         break;
5066       }
5067       case 4: {
5068         if (vlen_enc == AVX_128bit) {
5069           vpackssdw(dst, src, src, vlen_enc);
5070           vpacksswb(dst, dst, dst, vlen_enc);
5071         } else {
5072           vpackssdw(dst, src, src, vlen_enc);
5073           vpermq(dst, dst, 0x08, vlen_enc);
5074           vpacksswb(dst, dst, dst, AVX_128bit);
5075         }
5076         break;
5077       }
5078       case 8: {
5079         if (vlen_enc == AVX_128bit) {
5080           vpshufd(dst, src, 0x08, vlen_enc);
5081           vpackssdw(dst, dst, dst, vlen_enc);
5082           vpacksswb(dst, dst, dst, vlen_enc);
5083         } else {
5084           vpshufd(dst, src, 0x08, vlen_enc);
5085           vpermq(dst, dst, 0x08, vlen_enc);
5086           vpackssdw(dst, dst, dst, AVX_128bit);
5087           vpacksswb(dst, dst, dst, AVX_128bit);
5088         }
5089         break;
5090       }
5091       default: ShouldNotReachHere();
5092     }
5093   }
5094 }
5095 
5096 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5097                                    bool merge, BasicType bt, int vlen_enc) {
5098   if (bt == T_INT) {
5099     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5100   } else {
5101     assert(bt == T_LONG, "");
5102     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5103   }
5104 }
5105 
5106 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5107                                    bool merge, BasicType bt, int vlen_enc) {
5108   if (bt == T_INT) {
5109     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5110   } else {
5111     assert(bt == T_LONG, "");
5112     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5113   }
5114 }
5115 
5116 #ifdef _LP64
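     // Converts the low mask_len bits of 'src' into a vector of 0/1 bytes, one byte
     // per mask bit: pdepq with the 0x0101010101010101 selector deposits eight mask
     // bits into bit 0 of eight consecutive bytes per iteration; subsequent 8-bit
     // groups are shifted down from rtmp2 and inserted into dst via pinsrq/vinsertf128.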
5117 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5118                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5119                                                int vec_enc) {
5120   int index = 0;
5121   int vindex = 0;
5122   mov64(rtmp1, 0x0101010101010101L);
5123   pdepq(rtmp1, src, rtmp1);
5124   if (mask_len > 8) {
5125     movq(rtmp2, src);
5126     vpxor(xtmp, xtmp, xtmp, vec_enc);
5127     movq(xtmp, rtmp1);
5128   }
5129   movq(dst, rtmp1);
5130 
5131   mask_len -= 8;
5132   while (mask_len > 0) {
5133     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5134     index++;
5135     if ((index % 2) == 0) {
5136       pxor(xtmp, xtmp);
5137     }
5138     mov64(rtmp1, 0x0101010101010101L);
5139     shrq(rtmp2, 8);
5140     pdepq(rtmp1, rtmp2, rtmp1);
5141     pinsrq(xtmp, rtmp1, index % 2);
5142     vindex = index / 2;
5143     if (vindex) {
5144       // Write entire 16 byte vector when both 64 bit
5145       // lanes are updated to save redundant instructions.
5146       if (index % 2) {
5147         vinsertf128(dst, dst, xtmp, vindex);
5148       }
5149     } else {
5150       vmovdqu(dst, xtmp);
5151     }
5152     mask_len -= 8;
5153   }
5154 }
5155 
5156 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5157   switch(opc) {
5158     case Op_VectorMaskTrueCount:
5159       popcntq(dst, tmp);
5160       break;
5161     case Op_VectorMaskLastTrue:
5162       if (VM_Version::supports_lzcnt()) {
5163         lzcntq(tmp, tmp);
5164         movl(dst, 63);
5165         subl(dst, tmp);
5166       } else {
5167         movl(dst, -1);
5168         bsrq(tmp, tmp);
5169         cmov32(Assembler::notZero, dst, tmp);
5170       }
5171       break;
5172     case Op_VectorMaskFirstTrue:
5173       if (VM_Version::supports_bmi1()) {
5174         if (masklen < 32) {
5175           orl(tmp, 1 << masklen);
5176           tzcntl(dst, tmp);
5177         } else if (masklen == 32) {
5178           tzcntl(dst, tmp);
5179         } else {
5180           assert(masklen == 64, "");
5181           tzcntq(dst, tmp);
5182         }
5183       } else {
5184         if (masklen < 32) {
5185           orl(tmp, 1 << masklen);
5186           bsfl(dst, tmp);
5187         } else {
5188           assert(masklen == 32 || masklen == 64, "");
5189           movl(dst, masklen);
5190           if (masklen == 32)  {
5191             bsfl(tmp, tmp);
5192           } else {
5193             bsfq(tmp, tmp);
5194           }
5195           cmov32(Assembler::notZero, dst, tmp);
5196         }
5197       }
5198       break;
5199     case Op_VectorMaskToLong:
5200       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5201       break;
5202     default: assert(false, "Unhandled mask operation");
5203   }
5204 }
5205 
5206 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5207                                               int masklen, int masksize, int vec_enc) {
5208   assert(VM_Version::supports_popcnt(), "");
5209 
5210   if (VM_Version::supports_avx512bw()) {
5211     kmovql(tmp, mask);
5212   } else {
5213     assert(masklen <= 16, "");
5214     kmovwl(tmp, mask);
5215   }
5216 
5217   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5218   // operations needs to be clipped.
5219   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5220     andq(tmp, (1 << masklen) - 1);
5221   }
5222 
5223   vector_mask_operation_helper(opc, dst, tmp, masklen);
5224 }
5225 
5226 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5227                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5228   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5229          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5230   assert(VM_Version::supports_popcnt(), "");
5231 
5232   bool need_clip = false;
5233   switch(bt) {
5234     case T_BOOLEAN:
5235       // Masks of other types contain lane values 0 and -1, while boolean masks contain 0 and 1
5236       vpxor(xtmp, xtmp, xtmp, vec_enc);
5237       vpsubb(xtmp, xtmp, mask, vec_enc);
5238       vpmovmskb(tmp, xtmp, vec_enc);
5239       need_clip = masklen < 16;
5240       break;
5241     case T_BYTE:
5242       vpmovmskb(tmp, mask, vec_enc);
5243       need_clip = masklen < 16;
5244       break;
5245     case T_SHORT:
5246       vpacksswb(xtmp, mask, mask, vec_enc);
5247       if (masklen >= 16) {
5248         vpermpd(xtmp, xtmp, 8, vec_enc);
5249       }
5250       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5251       need_clip = masklen < 16;
5252       break;
5253     case T_INT:
5254     case T_FLOAT:
5255       vmovmskps(tmp, mask, vec_enc);
5256       need_clip = masklen < 4;
5257       break;
5258     case T_LONG:
5259     case T_DOUBLE:
5260       vmovmskpd(tmp, mask, vec_enc);
5261       need_clip = masklen < 2;
5262       break;
5263     default: assert(false, "Unhandled type, %s", type2name(bt));
5264   }
5265 
5266   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5267   // operations needs to be clipped.
5268   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5269     // need_clip implies masklen < 32
5270     andq(tmp, (1 << masklen) - 1);
5271   }
5272 
5273   vector_mask_operation_helper(opc, dst, tmp, masklen);
5274 }
5275 
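     // Compresses a mask with n set bits into a mask with its n lowest bits set: the
     // active mask_len bits are copied to a GPR and pextq gathers one bit from an
     // all-ones source for every set bit, leaving popcount(mask) contiguous low bits.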
5276 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5277                                              Register rtmp2, int mask_len) {
5278   kmov(rtmp1, src);
5279   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5280   mov64(rtmp2, -1L);
5281   pextq(rtmp2, rtmp2, rtmp1);
5282   kmov(dst, rtmp2);
5283 }
5284 
5285 #ifdef _LP64
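     // AVX2 fallback for CompressV/ExpandV on 4- and 8-byte lanes: the movmsk bits of
     // 'mask' select a 32-byte row from a stub-generated permute table
     // (row offset = mask_bits * 32), vpermps applies that row to 'src', and lanes
     // whose table entry is -1 are blended with zero.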
5286 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5287                                                     XMMRegister mask, Register rtmp, Register rscratch,
5288                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5289                                                     int vec_enc) {
5290   assert(type2aelembytes(bt) >= 4, "");
5291   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5292   address compress_perm_table = nullptr;
5293   address expand_perm_table = nullptr;
5294   if (type2aelembytes(bt) == 8) {
5295     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5296     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5297     vmovmskpd(rtmp, mask, vec_enc);
5298   } else {
5299     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5300     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5301     vmovmskps(rtmp, mask, vec_enc);
5302   }
5303   shlq(rtmp, 5); // for 32 byte permute row.
5304   if (opcode == Op_CompressV) {
5305     lea(rscratch, ExternalAddress(compress_perm_table));
5306   } else {
5307     lea(rscratch, ExternalAddress(expand_perm_table));
5308   }
5309   addptr(rtmp, rscratch);
5310   vmovdqu(permv, Address(rtmp));
5311   vpermps(dst, permv, src, Assembler::AVX_256bit);
5312   vpxor(xtmp, xtmp, xtmp, vec_enc);
5313   // Blend the result with a zero vector using the permute mask: each column entry
5314   // in a permute table row contains either a valid permute index or a -1 (default)
5315   // value, so the row can also serve as a blending mask after
5316   // compressing/expanding the source vector lanes.
5317   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5318 }
5319 #endif
5320 
5321 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5322                                                bool merge, BasicType bt, int vec_enc) {
5323   if (opcode == Op_CompressV) {
5324     switch(bt) {
5325     case T_BYTE:
5326       evpcompressb(dst, mask, src, merge, vec_enc);
5327       break;
5328     case T_CHAR:
5329     case T_SHORT:
5330       evpcompressw(dst, mask, src, merge, vec_enc);
5331       break;
5332     case T_INT:
5333       evpcompressd(dst, mask, src, merge, vec_enc);
5334       break;
5335     case T_FLOAT:
5336       evcompressps(dst, mask, src, merge, vec_enc);
5337       break;
5338     case T_LONG:
5339       evpcompressq(dst, mask, src, merge, vec_enc);
5340       break;
5341     case T_DOUBLE:
5342       evcompresspd(dst, mask, src, merge, vec_enc);
5343       break;
5344     default:
5345       fatal("Unsupported type %s", type2name(bt));
5346       break;
5347     }
5348   } else {
5349     assert(opcode == Op_ExpandV, "");
5350     switch(bt) {
5351     case T_BYTE:
5352       evpexpandb(dst, mask, src, merge, vec_enc);
5353       break;
5354     case T_CHAR:
5355     case T_SHORT:
5356       evpexpandw(dst, mask, src, merge, vec_enc);
5357       break;
5358     case T_INT:
5359       evpexpandd(dst, mask, src, merge, vec_enc);
5360       break;
5361     case T_FLOAT:
5362       evexpandps(dst, mask, src, merge, vec_enc);
5363       break;
5364     case T_LONG:
5365       evpexpandq(dst, mask, src, merge, vec_enc);
5366       break;
5367     case T_DOUBLE:
5368       evexpandpd(dst, mask, src, merge, vec_enc);
5369       break;
5370     default:
5371       fatal("Unsupported type %s", type2name(bt));
5372       break;
5373     }
5374   }
5375 }
5376 #endif
5377 
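     // Vector signum, i.e. lane-wise Math.signum semantics: produce -1.0 for negative
     // lanes and +1.0 for positive lanes, while NaN, -0.0 and +0.0 lanes return the
     // source value unchanged.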
5378 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5379                                            KRegister ktmp1, int vec_enc) {
5380   if (opcode == Op_SignumVD) {
5381     vsubpd(dst, zero, one, vec_enc);
5382     // if src < 0 ? -1 : 1
5383     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5384     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5385     // if src == NaN, -0.0 or 0.0 return src.
5386     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5387     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5388   } else {
5389     assert(opcode == Op_SignumVF, "");
5390     vsubps(dst, zero, one, vec_enc);
5391     // if src < 0 ? -1 : 1
5392     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5393     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5394     // if src == NaN, -0.0 or 0.0 return src.
5395     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5396     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5397   }
5398 }
5399 
5400 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5401                                           XMMRegister xtmp1, int vec_enc) {
5402   if (opcode == Op_SignumVD) {
5403     vsubpd(dst, zero, one, vec_enc);
5404     // if src < 0 ? -1 : 1
5405     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5406     // if src == NaN, -0.0 or 0.0 return src.
5407     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5408     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5409   } else {
5410     assert(opcode == Op_SignumVF, "");
5411     vsubps(dst, zero, one, vec_enc);
5412     // if src < 0 ? -1 : 1
5413     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5414     // if src is NaN, -0.0 or 0.0, return src.
5415     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5416     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5417   }
5418 }
5419 
5420 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5421   if (VM_Version::supports_avx512bw()) {
5422     if (mask_len > 32) {
5423       kmovql(dst, src);
5424     } else {
5425       kmovdl(dst, src);
5426       if (mask_len != 32) {
5427         kshiftrdl(dst, dst, 32 - mask_len);
5428       }
5429     }
5430   } else {
5431     assert(mask_len <= 16, "");
5432     kmovwl(dst, src);
5433     if (mask_len != 16) {
5434       kshiftrwl(dst, dst, 16 - mask_len);
5435     }
5436   }
5437 }
5438 
5439 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5440   int lane_size = type2aelembytes(bt);
5441   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5442   if ((is_LP64 || lane_size < 8) &&
5443       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5444        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5445     movptr(rtmp, imm32);
5446     switch(lane_size) {
5447       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5448       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5449       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5450       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5451       default : fatal("Unsupported lane size %d", lane_size);
5452       break;
5453     }
5454   } else {
5455     movptr(rtmp, imm32);
5456     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5457     switch(lane_size) {
5458       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5459       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5460       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5461       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5462       default : fatal("Unsupported lane size %d", lane_size);
5463       break;
5464     }
5465   }
5466 }
5467 
5468 //
5469 // The following is a lookup table based popcount computation algorithm:
5470 //       Index   Bit set count
5471 //     [ 0000 ->   0,
5472 //       0001 ->   1,
5473 //       0010 ->   1,
5474 //       0011 ->   2,
5475 //       0100 ->   1,
5476 //       0101 ->   2,
5477 //       0110 ->   2,
5478 //       0111 ->   3,
5479 //       1000 ->   1,
5480 //       1001 ->   2,
5481 //       1010 ->   2,
5482 //       1011 ->   3,
5483 //       1100 ->   2,
5484 //       1101 ->   3,
     //       1110 ->   3,
5485 //       1111 ->   4 ]
5486 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5487 //     shuffle indices for lookup table access.
5488 //  b. Right shift each byte of vector lane by 4 positions.
5489 //  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5490 //     shuffle indices for lookup table access.
5491 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5492 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5493 //     count of all the bytes of a quadword.
5494 //  f. Perform step e. for upper 128bit vector lane.
5495 //  g. Pack the bitset count of quadwords back to double word.
5496 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
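     //  Illustrative scalar form of steps a-d (sketch only, not used by the code below;
     //  helper name is hypothetical):
     //    static const uint8_t bitcount_lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
     //    int popcount_byte(uint8_t b) { return bitcount_lut[b & 0x0F] + bitcount_lut[b >> 4]; }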
5497 
5498 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5499                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5500   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5501   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5502   vpsrlw(dst, src, 4, vec_enc);
5503   vpand(dst, dst, xtmp1, vec_enc);
5504   vpand(xtmp1, src, xtmp1, vec_enc);
5505   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5506   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5507   vpshufb(dst, xtmp2, dst, vec_enc);
5508   vpaddb(dst, dst, xtmp1, vec_enc);
5509 }
5510 
5511 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5512                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5513   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5514   // Following code is as per steps e,f,g and h of above algorithm.
5515   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5516   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5517   vpsadbw(dst, dst, xtmp2, vec_enc);
5518   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5519   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5520   vpackuswb(dst, xtmp1, dst, vec_enc);
5521 }
5522 
5523 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5524                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5525   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5526   // Add the popcount of upper and lower bytes of word.
5527   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5528   vpsrlw(dst, xtmp1, 8, vec_enc);
5529   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5530   vpaddw(dst, dst, xtmp1, vec_enc);
5531 }
5532 
5533 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5534                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5535   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5536   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5537   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5538 }
5539 
5540 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5541                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5542   switch(bt) {
5543     case T_LONG:
5544       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5545       break;
5546     case T_INT:
5547       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5548       break;
5549     case T_CHAR:
5550     case T_SHORT:
5551       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5552       break;
5553     case T_BYTE:
5554     case T_BOOLEAN:
5555       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5556       break;
5557     default:
5558       fatal("Unsupported type %s", type2name(bt));
5559       break;
5560   }
5561 }
5562 
5563 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5564                                                       KRegister mask, bool merge, int vec_enc) {
5565   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5566   switch(bt) {
5567     case T_LONG:
5568       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5569       evpopcntq(dst, mask, src, merge, vec_enc);
5570       break;
5571     case T_INT:
5572       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5573       evpopcntd(dst, mask, src, merge, vec_enc);
5574       break;
5575     case T_CHAR:
5576     case T_SHORT:
5577       assert(VM_Version::supports_avx512_bitalg(), "");
5578       evpopcntw(dst, mask, src, merge, vec_enc);
5579       break;
5580     case T_BYTE:
5581     case T_BOOLEAN:
5582       assert(VM_Version::supports_avx512_bitalg(), "");
5583       evpopcntb(dst, mask, src, merge, vec_enc);
5584       break;
5585     default:
5586       fatal("Unsupported type %s", type2name(bt));
5587       break;
5588   }
5589 }
5590 
5591 #ifndef _LP64
5592 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5593   assert(VM_Version::supports_avx512bw(), "");
5594   kmovdl(tmp, src);
5595   kunpckdql(dst, tmp, tmp);
5596 }
5597 #endif
5598 
5599 // The bit reversal algorithm first reverses the bits of each byte, followed by
5600 // a byte level reversal for multi-byte primitive types (short/int/long).
5601 // The algorithm performs a lookup table access to get the reverse bit sequence
5602 // corresponding to a 4 bit value. Thus the reverse bit sequence of a byte
5603 // is obtained by swapping the reversed bit sequences of its upper and lower
5604 // nibbles.
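     // Illustrative scalar form (sketch only, not used by the code below; helper name is
     // hypothetical):
     //   static const uint8_t rev_nibble[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
     //                                          0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
     //   uint8_t reverse_byte(uint8_t b) {
     //     return (uint8_t)((rev_nibble[b & 0x0F] << 4) | rev_nibble[b >> 4]);
     //   }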
5605 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5606                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5607   if (VM_Version::supports_avx512vlbw()) {
5608 
5609     // Get the reverse bit sequence of lower nibble of each byte.
5610     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5611     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5612     evpandq(dst, xtmp2, src, vec_enc);
5613     vpshufb(dst, xtmp1, dst, vec_enc);
5614     vpsllq(dst, dst, 4, vec_enc);
5615 
5616     // Get the reverse bit sequence of upper nibble of each byte.
5617     vpandn(xtmp2, xtmp2, src, vec_enc);
5618     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5619     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5620 
5621     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
5622     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5623     evporq(xtmp2, dst, xtmp2, vec_enc);
5624     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5625 
5626   } else if (vec_enc == Assembler::AVX_512bit) {
5627     // Shift based bit reversal.
5628     assert(bt == T_LONG || bt == T_INT, "");
5629 
5630     // Swap lower and upper nibble of each byte.
5631     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5632 
5633     // Swap two least and most significant bits of each nibble.
5634     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5635 
5636     // Swap adjacent pair of bits.
5637     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5638     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5639 
5640     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5641     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5642   } else {
5643     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5644     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5645 
5646     // Get the reverse bit sequence of lower nibble of each byte.
5647     vpand(dst, xtmp2, src, vec_enc);
5648     vpshufb(dst, xtmp1, dst, vec_enc);
5649     vpsllq(dst, dst, 4, vec_enc);
5650 
5651     // Get the reverse bit sequence of upper nibble of each byte.
5652     vpandn(xtmp2, xtmp2, src, vec_enc);
5653     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5654     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5655 
5656     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and
5657     // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5658     vpor(xtmp2, dst, xtmp2, vec_enc);
5659     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5660   }
5661 }
5662 
5663 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5664                                                 XMMRegister xtmp, Register rscratch) {
5665   assert(VM_Version::supports_gfni(), "");
5666   assert(rscratch != noreg || always_reachable(mask), "missing");
5667 
5668   // Galois field instruction based bit reversal based on following algorithm.
5669   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5670   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5671   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5672   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5673 }
5674 
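     // Swaps the two nbits-wide halves selected by bitmask within each 2*nbits-wide group,
     // i.e. per lane this computes the scalar equivalent
     //   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
     // where bitmask marks the lower half of each group (e.g. bitmask 0x0F0F0F0F with
     // nbits == 4 swaps the nibbles of every byte).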
5675 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5676                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5677   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5678   evpandq(dst, xtmp1, src, vec_enc);
5679   vpsllq(dst, dst, nbits, vec_enc);
5680   vpandn(xtmp1, xtmp1, src, vec_enc);
5681   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5682   evporq(dst, dst, xtmp1, vec_enc);
5683 }
5684 
5685 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5686                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5687   // Shift based bit reversal.
5688   assert(VM_Version::supports_evex(), "");
5689   switch(bt) {
5690     case T_LONG:
5691       // Swap upper and lower double word of each quad word.
5692       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5693       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5694       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5695       break;
5696     case T_INT:
5697       // Swap upper and lower word of each double word.
5698       evprord(xtmp1, k0, src, 16, true, vec_enc);
5699       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5700       break;
5701     case T_CHAR:
5702     case T_SHORT:
5703       // Swap upper and lower byte of each word.
5704       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5705       break;
5706     case T_BYTE:
5707       evmovdquq(dst, k0, src, true, vec_enc);
5708       break;
5709     default:
5710       fatal("Unsupported type %s", type2name(bt));
5711       break;
5712   }
5713 }
5714 
5715 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5716   if (bt == T_BYTE) {
5717     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5718       evmovdquq(dst, k0, src, true, vec_enc);
5719     } else {
5720       vmovdqu(dst, src);
5721     }
5722     return;
5723   }
5724   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5725   // pre-computed shuffle indices.
5726   switch(bt) {
5727     case T_LONG:
5728       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5729       break;
5730     case T_INT:
5731       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5732       break;
5733     case T_CHAR:
5734     case T_SHORT:
5735       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5736       break;
5737     default:
5738       fatal("Unsupported type %s", type2name(bt));
5739       break;
5740   }
5741   vpshufb(dst, src, dst, vec_enc);
5742 }
5743 
5744 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5745                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5746                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5747   assert(is_integral_type(bt), "");
5748   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5749   assert(VM_Version::supports_avx512cd(), "");
5750   switch(bt) {
5751     case T_LONG:
5752       evplzcntq(dst, ktmp, src, merge, vec_enc);
5753       break;
5754     case T_INT:
5755       evplzcntd(dst, ktmp, src, merge, vec_enc);
5756       break;
5757     case T_SHORT:
5758       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5759       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5760       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5761       vpunpckhwd(dst, xtmp1, src, vec_enc);
5762       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5763       vpackusdw(dst, xtmp2, dst, vec_enc);
5764       break;
5765     case T_BYTE:
5766       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5767       // accessing the lookup table.
5768       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5769       // accessing the lookup table.
5770       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
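           // Illustrative scalar form (sketch only, not used by the code below; helper
           // name is hypothetical):
           //   static const uint8_t lz_nibble[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
           //   int lzcnt_byte(uint8_t b) {
           //     return ((b >> 4) == 0) ? lz_nibble[b >> 4] + lz_nibble[b & 0x0F]
           //                            : lz_nibble[b >> 4];
           //   }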
5771       assert(VM_Version::supports_avx512bw(), "");
5772       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5773       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5774       vpand(xtmp2, dst, src, vec_enc);
5775       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5776       vpsrlw(xtmp3, src, 4, vec_enc);
5777       vpand(xtmp3, dst, xtmp3, vec_enc);
5778       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5779       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5780       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5781       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5782       break;
5783     default:
5784       fatal("Unsupported type %s", type2name(bt));
5785       break;
5786   }
5787 }
5788 
5789 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5790                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5791   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5792   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5793   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5794   // accessing the lookup table.
5795   vpand(dst, xtmp2, src, vec_enc);
5796   vpshufb(dst, xtmp1, dst, vec_enc);
5797   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5798   // accessing the lookup table.
5799   vpsrlw(xtmp3, src, 4, vec_enc);
5800   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5801   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5802   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5803   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5804   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5805   vpaddb(dst, dst, xtmp2, vec_enc);
5806   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5807 }
5808 
5809 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5810                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5811   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5812   // Add zero counts of lower byte and upper byte of a word if
5813   // upper byte holds a zero value.
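       // i.e. per 16-bit lane: lz16(w) = ((w >> 8) == 0) ? 8 + lz8(w & 0xFF) : lz8(w >> 8)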
5814   vpsrlw(xtmp3, src, 8, vec_enc);
5815   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5816   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5817   vpsllw(xtmp2, dst, 8, vec_enc);
5818   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5819   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5820   vpsrlw(dst, dst, 8, vec_enc);
5821 }
5822 
5823 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5824                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5825   // Since the IEEE 754 floating point format keeps the mantissa in normalized
5826   // 1.x form, the biased exponent obtained from an int-to-float conversion can be
5827   // used to compute the leading zero count as per the following formula:
5828   // LZCNT = 32 - ((biased_exp - 127) + 1)
5829   // Special handling has been introduced for Zero, Max_Int and -ve source values.
5830 
5831   // Broadcast 0xFF
5832   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5833   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5834 
5835   // Extract biased exponent.
5836   vcvtdq2ps(dst, src, vec_enc);
5837   vpsrld(dst, dst, 23, vec_enc);
5838   vpand(dst, dst, xtmp1, vec_enc);
5839 
5840   // Broadcast 127.
5841   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5842   // Exponent = biased_exp - 127
5843   vpsubd(dst, dst, xtmp1, vec_enc);
5844 
5845   // Exponent = Exponent  + 1
5846   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5847   vpaddd(dst, dst, xtmp3, vec_enc);
5848 
5849   // Replace -ve exponent with zero, exponent is -ve when src
5850   // lane contains a zero value.
5851   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5852   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5853 
5854   // Rematerialize broadcast 32.
5855   vpslld(xtmp1, xtmp3, 5, vec_enc);
5856   // Exponent is 32 if corresponding source lane contains max_int value.
5857   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5858   // LZCNT = 32 - exponent
5859   vpsubd(dst, xtmp1, dst, vec_enc);
5860 
5861   // Replace LZCNT with a value 1 if corresponding source lane
5862   // contains max_int value.
5863   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5864 
5865   // Replace biased_exp with 0 if source lane value is less than zero.
5866   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5867   vblendvps(dst, dst, xtmp2, src, vec_enc);
5868 }
5869 
5870 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5871                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5872   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5873   // Add zero counts of lower word and upper word of a double word if
5874   // upper word holds a zero value.
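       // i.e. per 32-bit lane: lz32(x) = ((x >> 16) == 0) ? 16 + lz16(x & 0xFFFF) : lz16(x >> 16)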
5875   vpsrld(xtmp3, src, 16, vec_enc);
5876   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5877   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5878   vpslld(xtmp2, dst, 16, vec_enc);
5879   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5880   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5881   vpsrld(dst, dst, 16, vec_enc);
5882   // Add zero counts of lower doubleword and upper doubleword of a
5883   // quadword if upper doubleword holds a zero value.
5884   vpsrlq(xtmp3, src, 32, vec_enc);
5885   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5886   vpsllq(xtmp2, dst, 32, vec_enc);
5887   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5888   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5889   vpsrlq(dst, dst, 32, vec_enc);
5890 }
5891 
5892 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5893                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5894                                                        Register rtmp, int vec_enc) {
5895   assert(is_integral_type(bt), "unexpected type");
5896   assert(vec_enc < Assembler::AVX_512bit, "");
5897   switch(bt) {
5898     case T_LONG:
5899       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5900       break;
5901     case T_INT:
5902       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5903       break;
5904     case T_SHORT:
5905       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5906       break;
5907     case T_BYTE:
5908       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5909       break;
5910     default:
5911       fatal("Unsupported type %s", type2name(bt));
5912       break;
5913   }
5914 }
5915 
5916 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5917   switch(bt) {
5918     case T_BYTE:
5919       vpsubb(dst, src1, src2, vec_enc);
5920       break;
5921     case T_SHORT:
5922       vpsubw(dst, src1, src2, vec_enc);
5923       break;
5924     case T_INT:
5925       vpsubd(dst, src1, src2, vec_enc);
5926       break;
5927     case T_LONG:
5928       vpsubq(dst, src1, src2, vec_enc);
5929       break;
5930     default:
5931       fatal("Unsupported type %s", type2name(bt));
5932       break;
5933   }
5934 }
5935 
5936 // Trailing zero count computation is based on the leading zero count operation as per
5937 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
5938 // a direct vector instruction to compute the leading zero count.
5939 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
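     //      e.g. for x = 0b...01000: (x - 1) & ~x = 0b00111, so CLZ = PRIM_TYPE_WIDTH - 3
     //      and CTZ = 3; for x = 0 the masked value is all ones and CTZ = PRIM_TYPE_WIDTH.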
5940 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5941                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5942                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5943   assert(is_integral_type(bt), "");
5944   // xtmp = -1
5945   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5946   // xtmp = xtmp + src
5947   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5948   // xtmp = xtmp & ~src
5949   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5950   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5951   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5952   vpsub(bt, dst, xtmp4, dst, vec_enc);
5953 }
5954 
5955 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
5956 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
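     //      e.g. for x = 0b...01000: x | -x sets every bit from bit 3 upwards, so
     //      POPC(x | -x) = PRIM_TYPE_WIDTH - 3 and CTZ = 3; for x = 0, x | -x = 0 and
     //      CTZ = PRIM_TYPE_WIDTH.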
5957 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5958                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5959   assert(is_integral_type(bt), "");
5960   // xtmp = 0
5961   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5962   // xtmp = 0 - src
5963   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5964   // xtmp = xtmp | src
5965   vpor(xtmp3, xtmp3, src, vec_enc);
5966   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5967   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5968   vpsub(bt, dst, xtmp1, dst, vec_enc);
5969 }
5970 
5971 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5972   Label done;
5973   Label neg_divisor_fastpath;
5974   cmpl(divisor, 0);
5975   jccb(Assembler::less, neg_divisor_fastpath);
5976   xorl(rdx, rdx);
5977   divl(divisor);
5978   jmpb(done);
5979   bind(neg_divisor_fastpath);
5980   // Fastpath for divisor < 0:
5981   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5982   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
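       // When the divisor has its sign bit set it is >= 2^31 as an unsigned value, so the
       // unsigned quotient can only be 0 or 1; the expression above evaluates to 1 exactly
       // when dividend >= divisor under an unsigned comparison.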
5983   movl(rdx, rax);
5984   subl(rdx, divisor);
5985   if (VM_Version::supports_bmi1()) {
5986     andnl(rax, rdx, rax);
5987   } else {
5988     notl(rdx);
5989     andl(rax, rdx);
5990   }
5991   shrl(rax, 31);
5992   bind(done);
5993 }
5994 
5995 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5996   Label done;
5997   Label neg_divisor_fastpath;
5998   cmpl(divisor, 0);
5999   jccb(Assembler::less, neg_divisor_fastpath);
6000   xorl(rdx, rdx);
6001   divl(divisor);
6002   jmpb(done);
6003   bind(neg_divisor_fastpath);
6004   // Fastpath when divisor < 0:
6005   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6006   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
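       // Since the unsigned quotient is 0 or 1 here (see udivI above), the remainder is
       // either the dividend itself, or dividend - divisor when dividend >= divisor under
       // an unsigned comparison.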
6007   movl(rdx, rax);
6008   subl(rax, divisor);
6009   if (VM_Version::supports_bmi1()) {
6010     andnl(rax, rax, rdx);
6011   } else {
6012     notl(rax);
6013     andl(rax, rdx);
6014   }
6015   sarl(rax, 31);
6016   andl(rax, divisor);
6017   subl(rdx, rax);
6018   bind(done);
6019 }
6020 
6021 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6022   Label done;
6023   Label neg_divisor_fastpath;
6024 
6025   cmpl(divisor, 0);
6026   jccb(Assembler::less, neg_divisor_fastpath);
6027   xorl(rdx, rdx);
6028   divl(divisor);
6029   jmpb(done);
6030   bind(neg_divisor_fastpath);
6031   // Fastpath for divisor < 0:
6032   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6033   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6034   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6035   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6036   movl(rdx, rax);
6037   subl(rax, divisor);
6038   if (VM_Version::supports_bmi1()) {
6039     andnl(rax, rax, rdx);
6040   } else {
6041     notl(rax);
6042     andl(rax, rdx);
6043   }
6044   movl(tmp, rax);
6045   shrl(rax, 31); // quotient
6046   sarl(tmp, 31);
6047   andl(tmp, divisor);
6048   subl(rdx, tmp); // remainder
6049   bind(done);
6050 }
6051 
6052 #ifdef _LP64
6053 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6054                                  XMMRegister xtmp2, Register rtmp) {
6055   if (VM_Version::supports_gfni()) {
6056     // Galois field instruction based bit reversal based on following algorithm.
6057     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
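         // Under GF2P8AFFINEQB's bit ordering, the 0x8040201008040201 matrix maps bit i of
         // each byte to bit 7 - i, i.e. it reverses the bits within every byte; the bswapl
         // below then reverses the byte order to complete the 32-bit reversal.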
6058     mov64(rtmp, 0x8040201008040201L);
6059     movq(xtmp1, src);
6060     movq(xtmp2, rtmp);
6061     gf2p8affineqb(xtmp1, xtmp2, 0);
6062     movq(dst, xtmp1);
6063   } else {
6064     // Swap even and odd numbered bits.
6065     movl(rtmp, src);
6066     andl(rtmp, 0x55555555);
6067     shll(rtmp, 1);
6068     movl(dst, src);
6069     andl(dst, 0xAAAAAAAA);
6070     shrl(dst, 1);
6071     orl(dst, rtmp);
6072 
6073     // Swap LSB and MSB 2 bits of each nibble.
6074     movl(rtmp, dst);
6075     andl(rtmp, 0x33333333);
6076     shll(rtmp, 2);
6077     andl(dst, 0xCCCCCCCC);
6078     shrl(dst, 2);
6079     orl(dst, rtmp);
6080 
6081     // Swap LSB and MSB 4 bits of each byte.
6082     movl(rtmp, dst);
6083     andl(rtmp, 0x0F0F0F0F);
6084     shll(rtmp, 4);
6085     andl(dst, 0xF0F0F0F0);
6086     shrl(dst, 4);
6087     orl(dst, rtmp);
6088   }
6089   bswapl(dst);
6090 }
6091 
6092 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6093                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6094   if (VM_Version::supports_gfni()) {
6095     // Galois field instruction based bit reversal based on following algorithm.
6096     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6097     mov64(rtmp1, 0x8040201008040201L);
6098     movq(xtmp1, src);
6099     movq(xtmp2, rtmp1);
6100     gf2p8affineqb(xtmp1, xtmp2, 0);
6101     movq(dst, xtmp1);
6102   } else {
6103     // Swap even and odd numbered bits.
6104     movq(rtmp1, src);
6105     mov64(rtmp2, 0x5555555555555555L);
6106     andq(rtmp1, rtmp2);
6107     shlq(rtmp1, 1);
6108     movq(dst, src);
6109     notq(rtmp2);
6110     andq(dst, rtmp2);
6111     shrq(dst, 1);
6112     orq(dst, rtmp1);
6113 
6114     // Swap LSB and MSB 2 bits of each nibble.
6115     movq(rtmp1, dst);
6116     mov64(rtmp2, 0x3333333333333333L);
6117     andq(rtmp1, rtmp2);
6118     shlq(rtmp1, 2);
6119     notq(rtmp2);
6120     andq(dst, rtmp2);
6121     shrq(dst, 2);
6122     orq(dst, rtmp1);
6123 
6124     // Swap LSB and MSB 4 bits of each byte.
6125     movq(rtmp1, dst);
6126     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6127     andq(rtmp1, rtmp2);
6128     shlq(rtmp1, 4);
6129     notq(rtmp2);
6130     andq(dst, rtmp2);
6131     shrq(dst, 4);
6132     orq(dst, rtmp1);
6133   }
6134   bswapq(dst);
6135 }
6136 
6137 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6138   Label done;
6139   Label neg_divisor_fastpath;
6140   cmpq(divisor, 0);
6141   jccb(Assembler::less, neg_divisor_fastpath);
6142   xorl(rdx, rdx);
6143   divq(divisor);
6144   jmpb(done);
6145   bind(neg_divisor_fastpath);
6146   // Fastpath for divisor < 0:
6147   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6148   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6149   movq(rdx, rax);
6150   subq(rdx, divisor);
6151   if (VM_Version::supports_bmi1()) {
6152     andnq(rax, rdx, rax);
6153   } else {
6154     notq(rdx);
6155     andq(rax, rdx);
6156   }
6157   shrq(rax, 63);
6158   bind(done);
6159 }
6160 
6161 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6162   Label done;
6163   Label neg_divisor_fastpath;
6164   cmpq(divisor, 0);
6165   jccb(Assembler::less, neg_divisor_fastpath);
6166   xorq(rdx, rdx);
6167   divq(divisor);
6168   jmp(done);
6169   bind(neg_divisor_fastpath);
6170   // Fastpath when divisor < 0:
6171   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6172   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6173   movq(rdx, rax);
6174   subq(rax, divisor);
6175   if (VM_Version::supports_bmi1()) {
6176     andnq(rax, rax, rdx);
6177   } else {
6178     notq(rax);
6179     andq(rax, rdx);
6180   }
6181   sarq(rax, 63);
6182   andq(rax, divisor);
6183   subq(rdx, rax);
6184   bind(done);
6185 }
6186 
6187 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6188   Label done;
6189   Label neg_divisor_fastpath;
6190   cmpq(divisor, 0);
6191   jccb(Assembler::less, neg_divisor_fastpath);
6192   xorq(rdx, rdx);
6193   divq(divisor);
6194   jmp(done);
6195   bind(neg_divisor_fastpath);
6196   // Fastpath for divisor < 0:
6197   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6198   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6199   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6200   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6201   movq(rdx, rax);
6202   subq(rax, divisor);
6203   if (VM_Version::supports_bmi1()) {
6204     andnq(rax, rax, rdx);
6205   } else {
6206     notq(rax);
6207     andq(rax, rdx);
6208   }
6209   movq(tmp, rax);
6210   shrq(rax, 63); // quotient
6211   sarq(tmp, 63);
6212   andq(tmp, divisor);
6213   subq(rdx, tmp); // remainder
6214   bind(done);
6215 }
6216 #endif
6217 
6218 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6219                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6220                                         int vlen_enc) {
6221   assert(VM_Version::supports_avx512bw(), "");
6222   // Byte shuffles are in-lane operations and indices are determined using the
6223   // lower 4 bits of each shuffle lane, thus all shuffle indices are
6224   // normalized to the index range 0-15. This makes sure that indices which
6225   // differ by a multiple of 16 map to the same relative position within a
6226   // 128 bit lane, i.e. elements corresponding to shuffle indices 0, 16, 32 and
6227   // 48 all select the first element of their respective 128 bit lanes.
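       // Illustrative scalar form of the overall selection (sketch only, assuming shuffle
       // indices are in the range 0..63):
       //   for (int i = 0; i < 64; i++) {
       //     int idx  = shuffle[i];
       //     int lane = idx >> 4;            // which 128-bit lane of src supplies the byte
       //     dst[i]   = src[lane * 16 + (idx & 0x0F)];
       //   }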
6228   movl(rtmp, 16);
6229   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6230 
6231   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6232   // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
6233   // the original shuffle indices and move the shuffled lanes corresponding to a true
6234   // mask bit to the destination vector.
6235   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6236   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6237   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6238 
6239   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6240   // and broadcasting second 128 bit lane.
6241   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6242   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6243   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6244   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6245   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6246 
6247   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6248   // and broadcasting third 128 bit lane.
6249   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6250   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6251   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6252   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6253   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6254 
6255   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6256   // and broadcasting fourth 128 bit lane.
6257   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6258   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6259   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6260   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6261   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6262 }
6263 
6264 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6265                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6266   if (vlen_enc == AVX_128bit) {
6267     vpermilps(dst, src, shuffle, vlen_enc);
6268   } else if (bt == T_INT) {
6269     vpermd(dst, shuffle, src, vlen_enc);
6270   } else {
6271     assert(bt == T_FLOAT, "");
6272     vpermps(dst, shuffle, src, vlen_enc);
6273   }
6274 }