1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 
  40 #ifdef PRODUCT
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
  48 // C2 compiled method's prolog code.
  49 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  50 
  51   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  52   // NativeJump::patch_verified_entry will be able to patch out the entry
  53   // code safely. The push to verify stack depth is ok at 5 bytes,
  54   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  55   // stack bang then we must use the 6 byte frame allocation even if
  56   // we have no frame. :-(
  57   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  58 
  59   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  60   // Remove word for return addr
  61   framesize -= wordSize;
  62   stack_bang_size -= wordSize;
  63 
  64   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  68   // See bugs 4446381, 4468289, 4497237.
  69   if (stack_bang_size > 0) {
  70     generate_stack_overflow_check(stack_bang_size);
  71 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  74     push(rbp);
  75     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  76     if (PreserveFramePointer) {
  77       mov(rbp, rsp);
  78     }
  79     // Remove word for ebp
  80     framesize -= wordSize;
  81 
  82     // Create frame
  83     if (framesize) {
  84       subptr(rsp, framesize);
  85     }
  86   } else {
  87     // Create frame (force generation of a 4 byte immediate value)
  88     subptr_imm32(rsp, framesize);
  89 
  90     // Save RBP register now.
  91     framesize -= wordSize;
  92     movptr(Address(rsp, framesize), rbp);
  93     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  94     if (PreserveFramePointer) {
  95       movptr(rbp, rsp);
  96       if (framesize > 0) {
  97         addptr(rbp, framesize);
  98       }
  99     }
 100   }
 101 
 102   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 103     framesize -= wordSize;
 104     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 105   }
 106 
 107 #ifndef _LP64
 108   // If method sets FPU control word do it now
 109   if (fp_mode_24b) {
 110     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 111   }
 112   if (UseSSE >= 2 && VerifyFPU) {
 113     verify_FPU(0, "FPU stack must be clean on entry");
 114   }
 115 #endif
 116 
 117 #ifdef ASSERT
 118   if (VerifyStackAtCalls) {
 119     Label L;
 120     push(rax);
 121     mov(rax, rsp);
 122     andptr(rax, StackAlignmentInBytes-1);
 123     cmpptr(rax, StackAlignmentInBytes-wordSize);
 124     pop(rax);
 125     jcc(Assembler::equal, L);
 126     STOP("Stack is not properly aligned!");
 127     bind(L);
 128   }
 129 #endif
 130 
 131   if (!is_stub) {
 132     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 134     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 135       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 136       Label dummy_slow_path;
 137       Label dummy_continuation;
 138       Label* slow_path = &dummy_slow_path;
 139       Label* continuation = &dummy_continuation;
 140       if (!Compile::current()->output()->in_scratch_emit_size()) {
 141         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 142         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 143         Compile::current()->output()->add_stub(stub);
 144         slow_path = &stub->entry();
 145         continuation = &stub->continuation();
 146       }
 147       bs->nmethod_entry_barrier(this, slow_path, continuation);
 148     }
 149 #else
 150     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 151     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 152 #endif
 153   }
 154 }
 155 
 156 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 157   switch (vlen_in_bytes) {
 158     case  4: // fall-through
 159     case  8: // fall-through
 160     case 16: return Assembler::AVX_128bit;
 161     case 32: return Assembler::AVX_256bit;
 162     case 64: return Assembler::AVX_512bit;
 163 
 164     default: {
 165       ShouldNotReachHere();
 166       return Assembler::AVX_NoVec;
 167     }
 168   }
 169 }
 170 
 171 #if INCLUDE_RTM_OPT
 172 
 173 // Update rtm_counters based on abort status
 174 // input: abort_status
 175 //        rtm_counters (RTMLockingCounters*)
 176 // flags are killed
 177 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 178 
 179   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 180   if (PrintPreciseRTMLockingStatistics) {
 181     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 182       Label check_abort;
 183       testl(abort_status, (1<<i));
 184       jccb(Assembler::equal, check_abort);
 185       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 186       bind(check_abort);
 187     }
 188   }
 189 }
 190 
// Branch if (random & (count-1)) != 0; count must be a power of two (2^n)
 192 // tmp, scr and flags are killed
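// The low bits of the time-stamp counter act as a cheap pseudo-random source, so the
// caller falls through (e.g. to increment a statistics counter) roughly once every
// 'count' executions and takes the branch otherwise.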
 193 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 194   assert(tmp == rax, "");
 195   assert(scr == rdx, "");
 196   rdtsc(); // modifies EDX:EAX
 197   andptr(tmp, count-1);
 198   jccb(Assembler::notZero, brLabel);
 199 }
 200 
 201 // Perform abort ratio calculation, set no_rtm bit if high ratio
 202 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 203 // tmpReg, rtm_counters_Reg and flags are killed
 204 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 205                                                     Register rtm_counters_Reg,
 206                                                     RTMLockingCounters* rtm_counters,
 207                                                     Metadata* method_data) {
 208   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 209 
 210   if (RTMLockingCalculationDelay > 0) {
 211     // Delay calculation
 212     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 213     testptr(tmpReg, tmpReg);
 214     jccb(Assembler::equal, L_done);
 215   }
 216   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 217   //   Aborted transactions = abort_count * 100
 218   //   All transactions = total_count *  RTMTotalCountIncrRate
 219   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
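  //   i.e. the no_rtm bit is set when abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio,
  //   with RTMAbortRatio expressed as a percentage.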
 220 
 221   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 222   cmpptr(tmpReg, RTMAbortThreshold);
 223   jccb(Assembler::below, L_check_always_rtm2);
 224   imulptr(tmpReg, tmpReg, 100);
 225 
 226   Register scrReg = rtm_counters_Reg;
 227   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 228   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 229   imulptr(scrReg, scrReg, RTMAbortRatio);
 230   cmpptr(tmpReg, scrReg);
 231   jccb(Assembler::below, L_check_always_rtm1);
 232   if (method_data != nullptr) {
 233     // set rtm_state to "no rtm" in MDO
 234     mov_metadata(tmpReg, method_data);
 235     lock();
 236     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 237   }
 238   jmpb(L_done);
 239   bind(L_check_always_rtm1);
 240   // Reload RTMLockingCounters* address
 241   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 242   bind(L_check_always_rtm2);
 243   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 244   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 245   jccb(Assembler::below, L_done);
 246   if (method_data != nullptr) {
 247     // set rtm_state to "always rtm" in MDO
 248     mov_metadata(tmpReg, method_data);
 249     lock();
 250     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 251   }
 252   bind(L_done);
 253 }
 254 
 255 // Update counters and perform abort ratio calculation
 256 // input:  abort_status_Reg
 257 // rtm_counters_Reg, flags are killed
 258 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 259                                       Register rtm_counters_Reg,
 260                                       RTMLockingCounters* rtm_counters,
 261                                       Metadata* method_data,
 262                                       bool profile_rtm) {
 263 
 264   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 265   // update rtm counters based on rax value at abort
 266   // reads abort_status_Reg, updates flags
 267   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 268   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 269   if (profile_rtm) {
 270     // Save abort status because abort_status_Reg is used by following code.
 271     if (RTMRetryCount > 0) {
 272       push(abort_status_Reg);
 273     }
 274     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 275     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 276     // restore abort status
 277     if (RTMRetryCount > 0) {
 278       pop(abort_status_Reg);
 279     }
 280   }
 281 }
 282 
 283 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 284 // inputs: retry_count_Reg
 285 //       : abort_status_Reg
 286 // output: retry_count_Reg decremented by 1
 287 // flags are killed
 288 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 289   Label doneRetry;
 290   assert(abort_status_Reg == rax, "");
 291   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 292   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 293   // if reason is in 0x6 and retry count != 0 then retry
 294   andptr(abort_status_Reg, 0x6);
 295   jccb(Assembler::zero, doneRetry);
 296   testl(retry_count_Reg, retry_count_Reg);
 297   jccb(Assembler::zero, doneRetry);
 298   pause();
 299   decrementl(retry_count_Reg);
 300   jmp(retryLabel);
 301   bind(doneRetry);
 302 }
 303 
 304 // Spin and retry if lock is busy,
 305 // inputs: box_Reg (monitor address)
 306 //       : retry_count_Reg
 307 // output: retry_count_Reg decremented by 1
 308 //       : clear z flag if retry count exceeded
 309 // tmp_Reg, scr_Reg, flags are killed
 310 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 311                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 312   Label SpinLoop, SpinExit, doneRetry;
 313   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 314 
 315   testl(retry_count_Reg, retry_count_Reg);
 316   jccb(Assembler::zero, doneRetry);
 317   decrementl(retry_count_Reg);
 318   movptr(scr_Reg, RTMSpinLoopCount);
 319 
 320   bind(SpinLoop);
 321   pause();
 322   decrementl(scr_Reg);
 323   jccb(Assembler::lessEqual, SpinExit);
 324   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 325   testptr(tmp_Reg, tmp_Reg);
 326   jccb(Assembler::notZero, SpinLoop);
 327 
 328   bind(SpinExit);
 329   jmp(retryLabel);
 330   bind(doneRetry);
 331   incrementl(retry_count_Reg); // clear z flag
 332 }
 333 
 334 // Use RTM for normal stack locks
 335 // Input: objReg (object to lock)
 336 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 337                                          Register retry_on_abort_count_Reg,
 338                                          RTMLockingCounters* stack_rtm_counters,
 339                                          Metadata* method_data, bool profile_rtm,
 340                                          Label& DONE_LABEL, Label& IsInflated) {
 341   assert(UseRTMForStackLocks, "why call this otherwise?");
 342   assert(tmpReg == rax, "");
 343   assert(scrReg == rdx, "");
 344   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 345 
 346   if (RTMRetryCount > 0) {
 347     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 348     bind(L_rtm_retry);
 349   }
 350   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 351   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 352   jcc(Assembler::notZero, IsInflated);
 353 
 354   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 355     Label L_noincrement;
 356     if (RTMTotalCountIncrRate > 1) {
 357       // tmpReg, scrReg and flags are killed
 358       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 359     }
 360     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 361     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 362     bind(L_noincrement);
 363   }
 364   xbegin(L_on_abort);
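  // Everything from here to the matching xend/xabort executes transactionally: the lock
  // word is only read, so if the object stays unlocked the stack lock is elided and any
  // conflicting update by another thread aborts the transaction to L_on_abort.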
 365   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 366   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 367   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 368   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 369 
 370   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 371   if (UseRTMXendForLockBusy) {
 372     xend();
 373     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 374     jmp(L_decrement_retry);
 375   }
 376   else {
 377     xabort(0);
 378   }
 379   bind(L_on_abort);
 380   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 381     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 382   }
 383   bind(L_decrement_retry);
 384   if (RTMRetryCount > 0) {
 385     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 386     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 387   }
 388 }
 389 
 390 // Use RTM for inflating locks
 391 // inputs: objReg (object to lock)
 392 //         boxReg (on-stack box address (displaced header location) - KILLED)
 393 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 394 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 395                                             Register scrReg, Register retry_on_busy_count_Reg,
 396                                             Register retry_on_abort_count_Reg,
 397                                             RTMLockingCounters* rtm_counters,
 398                                             Metadata* method_data, bool profile_rtm,
 399                                             Label& DONE_LABEL) {
 400   assert(UseRTMLocking, "why call this otherwise?");
 401   assert(tmpReg == rax, "");
 402   assert(scrReg == rdx, "");
 403   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 404   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 405 
 406   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 407   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 408 
 409   if (RTMRetryCount > 0) {
 410     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 411     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 412     bind(L_rtm_retry);
 413   }
 414   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 415     Label L_noincrement;
 416     if (RTMTotalCountIncrRate > 1) {
 417       // tmpReg, scrReg and flags are killed
 418       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 419     }
 420     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 421     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 422     bind(L_noincrement);
 423   }
 424   xbegin(L_on_abort);
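  // The monitor enter is attempted transactionally: if the owner field reads null we are
  // done (the lock is elided), and a competing write to it aborts the transaction to
  // L_on_abort.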
 425   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 426   movptr(tmpReg, Address(tmpReg, owner_offset));
 427   testptr(tmpReg, tmpReg);
 428   jcc(Assembler::zero, DONE_LABEL);
 429   if (UseRTMXendForLockBusy) {
 430     xend();
 431     jmp(L_decrement_retry);
 432   }
 433   else {
 434     xabort(0);
 435   }
 436   bind(L_on_abort);
 437   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 438   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 439     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 440   }
 441   if (RTMRetryCount > 0) {
 442     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 443     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 444   }
 445 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 449 
 450   // Appears unlocked - try to swing _owner from null to non-null.
 451   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 452 #ifdef _LP64
 453   Register threadReg = r15_thread;
 454 #else
 455   get_thread(scrReg);
 456   Register threadReg = scrReg;
 457 #endif
 458   lock();
 459   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 460 
 461   if (RTMRetryCount > 0) {
 462     // success done else retry
 463     jccb(Assembler::equal, DONE_LABEL) ;
 464     bind(L_decrement_retry);
 465     // Spin and retry if lock is busy.
 466     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 467   }
 468   else {
 469     bind(L_decrement_retry);
 470   }
 471 }
 472 
 473 #endif //  INCLUDE_RTM_OPT
 474 
 475 // fast_lock and fast_unlock used by C2
 476 
 477 // Because the transitions from emitted code to the runtime
 478 // monitorenter/exit helper stubs are so slow it's critical that
 479 // we inline both the stack-locking fast path and the inflated fast path.
 480 //
 481 // See also: cmpFastLock and cmpFastUnlock.
 482 //
 483 // What follows is a specialized inline transliteration of the code
 484 // in enter() and exit(). If we're concerned about I$ bloat another
 485 // option would be to emit TrySlowEnter and TrySlowExit methods
 486 // at startup-time.  These methods would accept arguments as
 487 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 488 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 489 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 490 // In practice, however, the # of lock sites is bounded and is usually small.
 491 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 495 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 497 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 498 // to those specialized methods.  That'd give us a mostly platform-independent
 499 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 501 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 502 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 504 //
 505 // TODO:
 506 //
 507 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 508 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 509 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 510 //    the lock operators would typically be faster than reifying Self.
 511 //
 512 // *  Ideally I'd define the primitives as:
 513 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 514 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 515 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 517 //    Furthermore the register assignments are overconstrained, possibly resulting in
 518 //    sub-optimal code near the synchronization site.
 519 //
 520 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 521 //    Alternately, use a better sp-proximity test.
 522 //
 523 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 524 //    Either one is sufficient to uniquely identify a thread.
 525 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 526 //
 527 // *  Intrinsify notify() and notifyAll() for the common cases where the
 528 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 530 //
 531 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 532 //    But beware of excessive branch density on AMD Opterons.
 533 //
 534 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 535 //    or failure of the fast path.  If the fast path fails then we pass
 536 //    control to the slow path, typically in C.  In fast_lock and
 537 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 538 //    will emit a conditional branch immediately after the node.
 539 //    So we have branches to branches and lots of ICC.ZF games.
 540 //    Instead, it might be better to have C2 pass a "FailureLabel"
 541 //    into fast_lock and fast_unlock.  In the case of success, control
 542 //    will drop through the node.  ICC.ZF is undefined at exit.
 543 //    In the case of failure, the node will branch directly to the
 544 //    FailureLabel
 545 
 546 
 547 // obj: object to lock
 548 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 550 // scr: tmp -- KILLED
 551 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 552                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 553                                  RTMLockingCounters* rtm_counters,
 554                                  RTMLockingCounters* stack_rtm_counters,
 555                                  Metadata* method_data,
 556                                  bool use_rtm, bool profile_rtm) {
 557   // Ensure the register assignments are disjoint
 558   assert(tmpReg == rax, "");
 559 
 560   if (use_rtm) {
 561     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 562   } else {
 563     assert(cx1Reg == noreg, "");
 564     assert(cx2Reg == noreg, "");
 565     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 566   }
 567 
 568   // Possible cases that we'll encounter in fast_lock
 569   // ------------------------------------------------
 570   // * Inflated
 571   //    -- unlocked
 572   //    -- Locked
 573   //       = by self
 574   //       = by other
 575   // * neutral
 576   // * stack-locked
 577   //    -- by self
 578   //       = sp-proximity test hits
 579   //       = sp-proximity test generates false-negative
 580   //    -- by other
 581   //
 582 
 583   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 584 
 585   if (DiagnoseSyncOnValueBasedClasses != 0) {
 586     load_klass(tmpReg, objReg, scrReg);
 587     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 588     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 589     jcc(Assembler::notZero, DONE_LABEL);
 590   }
 591 
 592 #if INCLUDE_RTM_OPT
 593   if (UseRTMForStackLocks && use_rtm) {
 594     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 595     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 596                       stack_rtm_counters, method_data, profile_rtm,
 597                       DONE_LABEL, IsInflated);
 598   }
 599 #endif // INCLUDE_RTM_OPT
 600 
 601   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 602   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 603   jcc(Assembler::notZero, IsInflated);
 604 
 605   if (LockingMode == LM_MONITOR) {
 606     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 607     testptr(objReg, objReg);
 608   } else if (LockingMode == LM_LEGACY) {
 609     // Attempt stack-locking ...
 610     orptr (tmpReg, markWord::unlocked_value);
 611     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 612     lock();
 613     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 614     jcc(Assembler::equal, COUNT);           // Success
 615 
 616     // Recursive locking.
 617     // The object is stack-locked: markword contains stack pointer to BasicLock.
 618     // Locked by current thread if difference with current SP is less than one page.
 619     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 621     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
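    // (the mask keeps the low lock bits and every bit above the in-page offset, so the
    //  result is zero only for a stack address within one page of rsp with clear lock bits)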
 622     movptr(Address(boxReg, 0), tmpReg);
 623   } else {
 624     assert(LockingMode == LM_LIGHTWEIGHT, "");
 625     lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
 626     jmp(COUNT);
 627   }
 628   jmp(DONE_LABEL);
 629 
 630   bind(IsInflated);
 631   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 632 
 633 #if INCLUDE_RTM_OPT
 634   // Use the same RTM locking code in 32- and 64-bit VM.
 635   if (use_rtm) {
 636     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 637                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 638   } else {
 639 #endif // INCLUDE_RTM_OPT
 640 
 641 #ifndef _LP64
 642   // The object is inflated.
 643 
 644   // boxReg refers to the on-stack BasicLock in the current frame.
 645   // We'd like to write:
 646   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 648   // additional latency as we have another ST in the store buffer that must drain.
 649 
 650   // avoid ST-before-CAS
 651   // register juggle because we need tmpReg for cmpxchgptr below
 652   movptr(scrReg, boxReg);
 653   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 654 
 655   // Optimistic form: consider XORL tmpReg,tmpReg
 656   movptr(tmpReg, NULL_WORD);
 657 
 658   // Appears unlocked - try to swing _owner from null to non-null.
 659   // Ideally, I'd manifest "Self" with get_thread and then attempt
 660   // to CAS the register containing Self into m->Owner.
 661   // But we don't have enough registers, so instead we can either try to CAS
 662   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 663   // we later store "Self" into m->Owner.  Transiently storing a stack address
 664   // (rsp or the address of the box) into  m->owner is harmless.
 665   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 666   lock();
 667   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 668   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 669   // If we weren't able to swing _owner from null to the BasicLock
 670   // then take the slow path.
 671   jccb  (Assembler::notZero, NO_COUNT);
 672   // update _owner from BasicLock to thread
 673   get_thread (scrReg);                    // beware: clobbers ICCs
 674   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 675   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 676 
 677   // If the CAS fails we can either retry or pass control to the slow path.
 678   // We use the latter tactic.
 679   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 680   // If the CAS was successful ...
 681   //   Self has acquired the lock
 682   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 683   // Intentional fall-through into DONE_LABEL ...
 684 #else // _LP64
 685   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 686   movq(scrReg, tmpReg);
 687   xorq(tmpReg, tmpReg);
 688   lock();
 689   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 690   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 691   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 692   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 693   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 694   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 695 
 696   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 697   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 698   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 699   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 700 #endif // _LP64
 701 #if INCLUDE_RTM_OPT
 702   } // use_rtm()
 703 #endif
 704   bind(DONE_LABEL);
 705 
 706   // ZFlag == 1 count in fast path
 707   // ZFlag == 0 count in slow path
 708   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 709 
 710   bind(COUNT);
 711   // Count monitors in fast path
 712   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 713 
 714   xorl(tmpReg, tmpReg); // Set ZF == 1
 715 
 716   bind(NO_COUNT);
 717 
 718   // At NO_COUNT the icc ZFlag is set as follows ...
 719   // fast_unlock uses the same protocol.
 720   // ZFlag == 1 -> Success
 721   // ZFlag == 0 -> Failure - force control through the slow path
 722 }
 723 
 724 // obj: object to unlock
 725 // box: box address (displaced header location), killed.  Must be EAX.
 726 // tmp: killed, cannot be obj nor box.
 727 //
 728 // Some commentary on balanced locking:
 729 //
 730 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 731 // Methods that don't have provably balanced locking are forced to run in the
 732 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 733 // The interpreter provides two properties:
 734 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 736 //      interpreter maintains an on-stack list of locks currently held by
 737 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 740 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 742 // B() doesn't have provably balanced locking so it runs in the interpreter.
 743 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 744 // is still locked by A().
 745 //
 746 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 747 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 748 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 749 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
 752 // In the interest of performance we elide m->Owner==Self check in unlock.
 753 // A perfectly viable alternative is to elide the owner check except when
 754 // Xcheck:jni is enabled.
 755 
 756 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 757   assert(boxReg == rax, "");
 758   assert_different_registers(objReg, boxReg, tmpReg);
 759 
 760   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 761 
 762 #if INCLUDE_RTM_OPT
 763   if (UseRTMForStackLocks && use_rtm) {
 764     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 765     Label L_regular_unlock;
 766     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 767     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 768     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 769     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 770     xend();                                                           // otherwise end...
 771     jmp(DONE_LABEL);                                                  // ... and we're done
 772     bind(L_regular_unlock);
 773   }
 774 #endif
 775 
 776   if (LockingMode == LM_LEGACY) {
 777     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 778     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 779   }
 780   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 781   if (LockingMode != LM_MONITOR) {
 782     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 783     jcc(Assembler::zero, Stacked);
 784   }
 785 
 786   // It's inflated.
 787   if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an out-of-line stub.
 789     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 790 #ifdef _LP64
 791     if (!Compile::current()->output()->in_scratch_emit_size()) {
 792       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 793       Compile::current()->output()->add_stub(stub);
 794       jcc(Assembler::notEqual, stub->entry());
 795       bind(stub->continuation());
 796     } else
 797 #endif
 798     {
 799       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 800       // Call the slow-path instead.
 801       jcc(Assembler::notEqual, NO_COUNT);
 802     }
 803   }
 804 
 805 #if INCLUDE_RTM_OPT
 806   if (use_rtm) {
 807     Label L_regular_inflated_unlock;
 808     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 809     movptr(boxReg, Address(tmpReg, owner_offset));
 810     testptr(boxReg, boxReg);
 811     jccb(Assembler::notZero, L_regular_inflated_unlock);
 812     xend();
 813     jmp(DONE_LABEL);
 814     bind(L_regular_inflated_unlock);
 815   }
 816 #endif
 817 
 818   // Despite our balanced locking property we still check that m->_owner == Self
 819   // as java routines or native JNI code called by this thread might
 820   // have released the lock.
 821   // Refer to the comments in synchronizer.cpp for how we might encode extra
 822   // state in _succ so we can avoid fetching EntryList|cxq.
 823   //
 824   // If there's no contention try a 1-0 exit.  That is, exit without
 825   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 826   // we detect and recover from the race that the 1-0 exit admits.
 827   //
 828   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 829   // before it STs null into _owner, releasing the lock.  Updates
 830   // to data protected by the critical section must be visible before
 831   // we drop the lock (and thus before any other thread could acquire
 832   // the lock and observe the fields protected by the lock).
 833   // IA32's memory-model is SPO, so STs are ordered with respect to
 834   // each other and there's no need for an explicit barrier (fence).
 835   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 836 #ifndef _LP64
 837   // Note that we could employ various encoding schemes to reduce
 838   // the number of loads below (currently 4) to just 2 or 3.
 839   // Refer to the comments in synchronizer.cpp.
 840   // In practice the chain of fetches doesn't seem to impact performance, however.
 841   xorptr(boxReg, boxReg);
 842   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 843   jccb  (Assembler::notZero, DONE_LABEL);
 844   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 845   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 846   jccb  (Assembler::notZero, DONE_LABEL);
 847   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 848   jmpb  (DONE_LABEL);
 849 #else // _LP64
 850   // It's inflated
 851   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 852 
 853   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 854   jccb(Assembler::equal, LNotRecursive);
 855 
 856   // Recursive inflated unlock
 857   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 858   jmpb(LSuccess);
 859 
 860   bind(LNotRecursive);
 861   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 862   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 863   jccb  (Assembler::notZero, CheckSucc);
 864   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 865   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 866   jmpb  (DONE_LABEL);
 867 
 868   // Try to avoid passing control into the slow_path ...
 869   bind  (CheckSucc);
 870 
 871   // The following optional optimization can be elided if necessary
 872   // Effectively: if (succ == null) goto slow path
 873   // The code reduces the window for a race, however,
 874   // and thus benefits performance.
 875   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 876   jccb  (Assembler::zero, LGoSlowPath);
 877 
 878   xorptr(boxReg, boxReg);
 879   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 880   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 881 
 882   // Memory barrier/fence
 883   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 884   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 885   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 886   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 887   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 888   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 889   lock(); addl(Address(rsp, 0), 0);
 890 
 891   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 892   jccb  (Assembler::notZero, LSuccess);
 893 
 894   // Rare inopportune interleaving - race.
 895   // The successor vanished in the small window above.
 896   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 897   // We need to ensure progress and succession.
 898   // Try to reacquire the lock.
 899   // If that fails then the new owner is responsible for succession and this
 900   // thread needs to take no further action and can exit via the fast path (success).
 901   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 905 
 906   // box is really RAX -- the following CMPXCHG depends on that binding
 907   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 908   lock();
 909   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 910   // There's no successor so we tried to regrab the lock.
 911   // If that didn't work, then another thread grabbed the
 912   // lock so we're done (and exit was a success).
 913   jccb  (Assembler::notEqual, LSuccess);
 914   // Intentional fall-through into slow path
 915 
 916   bind  (LGoSlowPath);
 917   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 918   jmpb  (DONE_LABEL);
 919 
 920   bind  (LSuccess);
 921   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 922   jmpb  (DONE_LABEL);
 923 
 924 #endif
 925   if (LockingMode != LM_MONITOR) {
 926     bind  (Stacked);
 927     if (LockingMode == LM_LIGHTWEIGHT) {
 928       mov(boxReg, tmpReg);
 929       lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT);
 930       jmp(COUNT);
 931     } else if (LockingMode == LM_LEGACY) {
 932       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 933       lock();
 934       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 935     }
 936     // Intentional fall-thru into DONE_LABEL
 937   }
 938   bind(DONE_LABEL);
 939 
 940   // ZFlag == 1 count in fast path
 941   // ZFlag == 0 count in slow path
 942   jccb(Assembler::notZero, NO_COUNT);
 943 
 944   bind(COUNT);
 945   // Count monitors in fast path
 946 #ifndef _LP64
 947   get_thread(tmpReg);
 948   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 949 #else // _LP64
 950   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 951 #endif
 952 
 953   xorl(tmpReg, tmpReg); // Set ZF == 1
 954 
 955   bind(NO_COUNT);
 956 }
 957 
 958 //-------------------------------------------------------------------------------------------
 959 // Generic instructions support for use in .ad files C2 code generation
 960 
 961 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 962   if (dst != src) {
 963     movdqu(dst, src);
 964   }
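  // Abs clears the sign bit (AND with the sign mask); Neg flips it (XOR with the sign-flip mask).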
 965   if (opcode == Op_AbsVD) {
 966     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 967   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 969     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 970   }
 971 }
 972 
 973 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 974   if (opcode == Op_AbsVD) {
 975     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 976   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 978     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 979   }
 980 }
 981 
 982 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 983   if (dst != src) {
 984     movdqu(dst, src);
 985   }
 986   if (opcode == Op_AbsVF) {
 987     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 988   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 990     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 991   }
 992 }
 993 
 994 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 995   if (opcode == Op_AbsVF) {
 996     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 997   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 999     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1000   }
1001 }
1002 
1003 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1004   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1005   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
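  // There is no pminsq/pmaxsq below AVX-512, so the T_LONG cases are emulated with
  // pcmpgtq + blendvpd, where blendvpd implicitly uses xmm0 as the mask register.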
1006 
1007   if (opcode == Op_MinV) {
1008     if (elem_bt == T_BYTE) {
1009       pminsb(dst, src);
1010     } else if (elem_bt == T_SHORT) {
1011       pminsw(dst, src);
1012     } else if (elem_bt == T_INT) {
1013       pminsd(dst, src);
1014     } else {
1015       assert(elem_bt == T_LONG, "required");
1016       assert(tmp == xmm0, "required");
1017       assert_different_registers(dst, src, tmp);
1018       movdqu(xmm0, dst);
1019       pcmpgtq(xmm0, src);
1020       blendvpd(dst, src);  // xmm0 as mask
1021     }
1022   } else { // opcode == Op_MaxV
1023     if (elem_bt == T_BYTE) {
1024       pmaxsb(dst, src);
1025     } else if (elem_bt == T_SHORT) {
1026       pmaxsw(dst, src);
1027     } else if (elem_bt == T_INT) {
1028       pmaxsd(dst, src);
1029     } else {
1030       assert(elem_bt == T_LONG, "required");
1031       assert(tmp == xmm0, "required");
1032       assert_different_registers(dst, src, tmp);
1033       movdqu(xmm0, src);
1034       pcmpgtq(xmm0, dst);
1035       blendvpd(dst, src);  // xmm0 as mask
1036     }
1037   }
1038 }
1039 
1040 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1041                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1042                                  int vlen_enc) {
1043   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
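  // vpminsq/vpmaxsq require AVX-512 (plus AVX-512VL for vectors shorter than 512 bits);
  // otherwise the T_LONG cases are emulated with vpcmpgtq + vblendvpd.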
1044 
1045   if (opcode == Op_MinV) {
1046     if (elem_bt == T_BYTE) {
1047       vpminsb(dst, src1, src2, vlen_enc);
1048     } else if (elem_bt == T_SHORT) {
1049       vpminsw(dst, src1, src2, vlen_enc);
1050     } else if (elem_bt == T_INT) {
1051       vpminsd(dst, src1, src2, vlen_enc);
1052     } else {
1053       assert(elem_bt == T_LONG, "required");
1054       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1055         vpminsq(dst, src1, src2, vlen_enc);
1056       } else {
1057         assert_different_registers(dst, src1, src2);
1058         vpcmpgtq(dst, src1, src2, vlen_enc);
1059         vblendvpd(dst, src1, src2, dst, vlen_enc);
1060       }
1061     }
1062   } else { // opcode == Op_MaxV
1063     if (elem_bt == T_BYTE) {
1064       vpmaxsb(dst, src1, src2, vlen_enc);
1065     } else if (elem_bt == T_SHORT) {
1066       vpmaxsw(dst, src1, src2, vlen_enc);
1067     } else if (elem_bt == T_INT) {
1068       vpmaxsd(dst, src1, src2, vlen_enc);
1069     } else {
1070       assert(elem_bt == T_LONG, "required");
1071       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1072         vpmaxsq(dst, src1, src2, vlen_enc);
1073       } else {
1074         assert_different_registers(dst, src1, src2);
1075         vpcmpgtq(dst, src1, src2, vlen_enc);
1076         vblendvpd(dst, src2, src1, dst, vlen_enc);
1077       }
1078     }
1079   }
1080 }
1081 
1082 // Float/Double min max
1083 
1084 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1085                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1086                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1087                                    int vlen_enc) {
1088   assert(UseAVX > 0, "required");
1089   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1090          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1091   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1092   assert_different_registers(a, b, tmp, atmp, btmp);
1093 
1094   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1095   bool is_double_word = is_double_word_type(elem_bt);
1096 
1097   /* Note on 'non-obvious' assembly sequence:
1098    *
1099    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1100    * and Java on how they handle floats:
1101    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1103    *
1104    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1105    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1106    *                (only useful when signs differ, noop otherwise)
1107    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
1109    *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
1110    *   btmp = (b < +0.0) ? a : b
1111    *   atmp = (b < +0.0) ? b : a
1112    *   Tmp  = Max_Float(atmp , btmp)
1113    *   Res  = (atmp == NaN) ? atmp : Tmp
1114    */
1115 
1116   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1117   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1118   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1119   XMMRegister mask;
1120 
1121   if (!is_double_word && is_min) {
1122     mask = a;
1123     vblend = &MacroAssembler::vblendvps;
1124     vmaxmin = &MacroAssembler::vminps;
1125     vcmp = &MacroAssembler::vcmpps;
1126   } else if (!is_double_word && !is_min) {
1127     mask = b;
1128     vblend = &MacroAssembler::vblendvps;
1129     vmaxmin = &MacroAssembler::vmaxps;
1130     vcmp = &MacroAssembler::vcmpps;
1131   } else if (is_double_word && is_min) {
1132     mask = a;
1133     vblend = &MacroAssembler::vblendvpd;
1134     vmaxmin = &MacroAssembler::vminpd;
1135     vcmp = &MacroAssembler::vcmppd;
1136   } else {
1137     assert(is_double_word && !is_min, "sanity");
1138     mask = b;
1139     vblend = &MacroAssembler::vblendvpd;
1140     vmaxmin = &MacroAssembler::vmaxpd;
1141     vcmp = &MacroAssembler::vcmppd;
1142   }
1143 
1144   // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
1145   XMMRegister maxmin, scratch;
1146   if (dst == btmp) {
1147     maxmin = btmp;
1148     scratch = tmp;
1149   } else {
1150     maxmin = tmp;
1151     scratch = btmp;
1152   }
1153 
1154   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1155   if (precompute_mask && !is_double_word) {
1156     vpsrad(tmp, mask, 32, vlen_enc);
1157     mask = tmp;
1158   } else if (precompute_mask && is_double_word) {
1159     vpxor(tmp, tmp, tmp, vlen_enc);
1160     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1161     mask = tmp;
1162   }
1163 
1164   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1165   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1166   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1167   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1168   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1169 }
1170 
1171 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1172                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1173                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1174                                     int vlen_enc) {
1175   assert(UseAVX > 2, "required");
1176   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1177          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1178   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1179   assert_different_registers(dst, a, b, atmp, btmp);
1180 
1181   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1182   bool is_double_word = is_double_word_type(elem_bt);
1183   bool merge = true;
1184 
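  // evpmovd2m/evpmovq2m copy the per-lane sign bits into ktmp, which drives the blends
  // that implement the +/-0.0 bias; the final UNORD_Q compare merges NaN lanes from atmp
  // back into dst.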
1185   if (!is_double_word && is_min) {
1186     evpmovd2m(ktmp, a, vlen_enc);
1187     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1188     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1189     vminps(dst, atmp, btmp, vlen_enc);
1190     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1191     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1192   } else if (!is_double_word && !is_min) {
1193     evpmovd2m(ktmp, b, vlen_enc);
1194     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1195     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1196     vmaxps(dst, atmp, btmp, vlen_enc);
1197     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1198     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1199   } else if (is_double_word && is_min) {
1200     evpmovq2m(ktmp, a, vlen_enc);
1201     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1202     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1203     vminpd(dst, atmp, btmp, vlen_enc);
1204     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1205     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1206   } else {
1207     assert(is_double_word && !is_min, "sanity");
1208     evpmovq2m(ktmp, b, vlen_enc);
1209     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1210     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1211     vmaxpd(dst, atmp, btmp, vlen_enc);
1212     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1213     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1214   }
1215 }
1216 
1217 // Float/Double signum
1218 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1219   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1220 
1221   Label DONE_LABEL;
1222 
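  // Return the argument unchanged for +/-0.0 and NaN; otherwise return 1.0 for positive
  // inputs and -1.0 (1.0 with its sign bit flipped) for negative inputs.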
1223   if (opcode == Op_SignumF) {
1224     assert(UseSSE > 0, "required");
1225     ucomiss(dst, zero);
1226     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1227     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1228     movflt(dst, one);
1229     jcc(Assembler::above, DONE_LABEL);
1230     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1231   } else if (opcode == Op_SignumD) {
1232     assert(UseSSE > 1, "required");
1233     ucomisd(dst, zero);
1234     jcc(Assembler::equal, DONE_LABEL);    // special case +0.0/-0.0: return the argument unchanged
1235     jcc(Assembler::parity, DONE_LABEL);   // special case NaN: return the argument (NaN) unchanged
1236     movdbl(dst, one);
1237     jcc(Assembler::above, DONE_LABEL);
1238     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1239   }
1240 
1241   bind(DONE_LABEL);
1242 }
1243 
1244 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1245   if (sign) {
1246     pmovsxbw(dst, src);
1247   } else {
1248     pmovzxbw(dst, src);
1249   }
1250 }
1251 
1252 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1253   if (sign) {
1254     vpmovsxbw(dst, src, vector_len);
1255   } else {
1256     vpmovzxbw(dst, src, vector_len);
1257   }
1258 }
1259 
1260 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1261   if (sign) {
1262     vpmovsxbd(dst, src, vector_len);
1263   } else {
1264     vpmovzxbd(dst, src, vector_len);
1265   }
1266 }
1267 
1268 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1269   if (sign) {
1270     vpmovsxwd(dst, src, vector_len);
1271   } else {
1272     vpmovzxwd(dst, src, vector_len);
1273   }
1274 }
1275 
1276 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1277                                      int shift, int vector_len) {
1278   if (opcode == Op_RotateLeftV) {
1279     if (etype == T_INT) {
1280       evprold(dst, src, shift, vector_len);
1281     } else {
1282       assert(etype == T_LONG, "expected type T_LONG");
1283       evprolq(dst, src, shift, vector_len);
1284     }
1285   } else {
1286     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1287     if (etype == T_INT) {
1288       evprord(dst, src, shift, vector_len);
1289     } else {
1290       assert(etype == T_LONG, "expected type T_LONG");
1291       evprorq(dst, src, shift, vector_len);
1292     }
1293   }
1294 }
1295 
1296 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1297                                      XMMRegister shift, int vector_len) {
1298   if (opcode == Op_RotateLeftV) {
1299     if (etype == T_INT) {
1300       evprolvd(dst, src, shift, vector_len);
1301     } else {
1302       assert(etype == T_LONG, "expected type T_LONG");
1303       evprolvq(dst, src, shift, vector_len);
1304     }
1305   } else {
1306     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1307     if (etype == T_INT) {
1308       evprorvd(dst, src, shift, vector_len);
1309     } else {
1310       assert(etype == T_LONG, "expected type T_LONG");
1311       evprorvq(dst, src, shift, vector_len);
1312     }
1313   }
1314 }
1315 
1316 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1317   if (opcode == Op_RShiftVI) {
1318     psrad(dst, shift);
1319   } else if (opcode == Op_LShiftVI) {
1320     pslld(dst, shift);
1321   } else {
1322     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1323     psrld(dst, shift);
1324   }
1325 }
1326 
1327 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1328   switch (opcode) {
1329     case Op_RShiftVI:  psrad(dst, shift); break;
1330     case Op_LShiftVI:  pslld(dst, shift); break;
1331     case Op_URShiftVI: psrld(dst, shift); break;
1332 
1333     default: assert(false, "%s", NodeClassNames[opcode]);
1334   }
1335 }
1336 
1337 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1338   if (opcode == Op_RShiftVI) {
1339     vpsrad(dst, nds, shift, vector_len);
1340   } else if (opcode == Op_LShiftVI) {
1341     vpslld(dst, nds, shift, vector_len);
1342   } else {
1343     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1344     vpsrld(dst, nds, shift, vector_len);
1345   }
1346 }
1347 
1348 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1349   switch (opcode) {
1350     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1351     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1352     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1353 
1354     default: assert(false, "%s", NodeClassNames[opcode]);
1355   }
1356 }
1357 
1358 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1359   switch (opcode) {
1360     case Op_RShiftVB:  // fall-through
1361     case Op_RShiftVS:  psraw(dst, shift); break;
1362 
1363     case Op_LShiftVB:  // fall-through
1364     case Op_LShiftVS:  psllw(dst, shift);   break;
1365 
1366     case Op_URShiftVS: // fall-through
1367     case Op_URShiftVB: psrlw(dst, shift);  break;
1368 
1369     default: assert(false, "%s", NodeClassNames[opcode]);
1370   }
1371 }
1372 
1373 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1374   switch (opcode) {
1375     case Op_RShiftVB:  // fall-through
1376     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1377 
1378     case Op_LShiftVB:  // fall-through
1379     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1380 
1381     case Op_URShiftVS: // fall-through
1382     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1383 
1384     default: assert(false, "%s", NodeClassNames[opcode]);
1385   }
1386 }
1387 
1388 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1389   switch (opcode) {
1390     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1391     case Op_LShiftVL:  psllq(dst, shift); break;
1392     case Op_URShiftVL: psrlq(dst, shift); break;
1393 
1394     default: assert(false, "%s", NodeClassNames[opcode]);
1395   }
1396 }
1397 
1398 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1399   if (opcode == Op_RShiftVL) {
1400     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1401   } else if (opcode == Op_LShiftVL) {
1402     psllq(dst, shift);
1403   } else {
1404     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1405     psrlq(dst, shift);
1406   }
1407 }
1408 
1409 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1410   switch (opcode) {
1411     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1412     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1413     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1414 
1415     default: assert(false, "%s", NodeClassNames[opcode]);
1416   }
1417 }
1418 
1419 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1420   if (opcode == Op_RShiftVL) {
1421     evpsraq(dst, nds, shift, vector_len);
1422   } else if (opcode == Op_LShiftVL) {
1423     vpsllq(dst, nds, shift, vector_len);
1424   } else {
1425     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1426     vpsrlq(dst, nds, shift, vector_len);
1427   }
1428 }
1429 
1430 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1431   switch (opcode) {
1432     case Op_RShiftVB:  // fall-through
1433     case Op_RShiftVS:  // fall-through
1434     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1435 
1436     case Op_LShiftVB:  // fall-through
1437     case Op_LShiftVS:  // fall-through
1438     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1439 
1440     case Op_URShiftVB: // fall-through
1441     case Op_URShiftVS: // fall-through
1442     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1443 
1444     default: assert(false, "%s", NodeClassNames[opcode]);
1445   }
1446 }
1447 
1448 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1449   switch (opcode) {
1450     case Op_RShiftVB:  // fall-through
1451     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1452 
1453     case Op_LShiftVB:  // fall-through
1454     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1455 
1456     case Op_URShiftVB: // fall-through
1457     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1458 
1459     default: assert(false, "%s", NodeClassNames[opcode]);
1460   }
1461 }
1462 
1463 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1464   assert(UseAVX >= 2, "required");
1465   switch (opcode) {
1466     case Op_RShiftVL: {
1467       if (UseAVX > 2) {
1468         assert(tmp == xnoreg, "not used");
1469         if (!VM_Version::supports_avx512vl()) {
1470           vlen_enc = Assembler::AVX_512bit;
1471         }
1472         evpsravq(dst, src, shift, vlen_enc);
1473       } else {
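             // No 64-bit arithmetic right shift before AVX-512: emulate sra with
             // logical shifts via ((x >>> s) ^ m) - m, where m = 0x8000000000000000 >>> s.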
1474         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1475         vpsrlvq(dst, src, shift, vlen_enc);
1476         vpsrlvq(tmp, tmp, shift, vlen_enc);
1477         vpxor(dst, dst, tmp, vlen_enc);
1478         vpsubq(dst, dst, tmp, vlen_enc);
1479       }
1480       break;
1481     }
1482     case Op_LShiftVL: {
1483       assert(tmp == xnoreg, "not used");
1484       vpsllvq(dst, src, shift, vlen_enc);
1485       break;
1486     }
1487     case Op_URShiftVL: {
1488       assert(tmp == xnoreg, "not used");
1489       vpsrlvq(dst, src, shift, vlen_enc);
1490       break;
1491     }
1492     default: assert(false, "%s", NodeClassNames[opcode]);
1493   }
1494 }
1495 
1496 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1497 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1498   assert(opcode == Op_LShiftVB ||
1499          opcode == Op_RShiftVB ||
1500          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1501   bool sign = (opcode != Op_URShiftVB);
1502   assert(vector_len == 0, "required");
1503   vextendbd(sign, dst, src, 1);
1504   vpmovzxbd(vtmp, shift, 1);
1505   varshiftd(opcode, dst, dst, vtmp, 1);
1506   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1507   vextracti128_high(vtmp, dst);
1508   vpackusdw(dst, dst, vtmp, 0);
1509 }
1510 
1511 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1512 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1513   assert(opcode == Op_LShiftVB ||
1514          opcode == Op_RShiftVB ||
1515          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1516   bool sign = (opcode != Op_URShiftVB);
1517   int ext_vector_len = vector_len + 1;
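       // Widen the bytes to words (one vector size up), shift in the word domain,
       // mask the results back to byte range, then pack back down to bytes.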
1518   vextendbw(sign, dst, src, ext_vector_len);
1519   vpmovzxbw(vtmp, shift, ext_vector_len);
1520   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1521   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1522   if (vector_len == 0) {
1523     vextracti128_high(vtmp, dst);
1524     vpackuswb(dst, dst, vtmp, vector_len);
1525   } else {
1526     vextracti64x4_high(vtmp, dst);
1527     vpackuswb(dst, dst, vtmp, vector_len);
1528     vpermq(dst, dst, 0xD8, vector_len);
1529   }
1530 }
1531 
1532 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1533   switch(typ) {
1534     case T_BYTE:
1535       pinsrb(dst, val, idx);
1536       break;
1537     case T_SHORT:
1538       pinsrw(dst, val, idx);
1539       break;
1540     case T_INT:
1541       pinsrd(dst, val, idx);
1542       break;
1543     case T_LONG:
1544       pinsrq(dst, val, idx);
1545       break;
1546     default:
1547       assert(false,"Should not reach here.");
1548       break;
1549   }
1550 }
1551 
1552 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1553   switch(typ) {
1554     case T_BYTE:
1555       vpinsrb(dst, src, val, idx);
1556       break;
1557     case T_SHORT:
1558       vpinsrw(dst, src, val, idx);
1559       break;
1560     case T_INT:
1561       vpinsrd(dst, src, val, idx);
1562       break;
1563     case T_LONG:
1564       vpinsrq(dst, src, val, idx);
1565       break;
1566     default:
1567       assert(false,"Should not reach here.");
1568       break;
1569   }
1570 }
1571 
1572 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1573   switch(typ) {
1574     case T_INT:
1575       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1576       break;
1577     case T_FLOAT:
1578       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1579       break;
1580     case T_LONG:
1581       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1582       break;
1583     case T_DOUBLE:
1584       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1585       break;
1586     default:
1587       assert(false,"Should not reach here.");
1588       break;
1589   }
1590 }
1591 
1592 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1593   switch(typ) {
1594     case T_INT:
1595       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1596       break;
1597     case T_FLOAT:
1598       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1599       break;
1600     case T_LONG:
1601       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1602       break;
1603     case T_DOUBLE:
1604       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1605       break;
1606     default:
1607       assert(false,"Should not reach here.");
1608       break;
1609   }
1610 }
1611 
1612 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1613   switch(typ) {
1614     case T_INT:
1615       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1616       break;
1617     case T_FLOAT:
1618       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1619       break;
1620     case T_LONG:
1621       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1622       break;
1623     case T_DOUBLE:
1624       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1625       break;
1626     default:
1627       assert(false,"Should not reach here.");
1628       break;
1629   }
1630 }
1631 
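     // Convert a vector of booleans (one byte per element, 0 or 1) into a vector mask
     // of the requested element type: 0 - src turns 1 into 0xFF, which is then
     // sign-extended to an all-ones element.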
1632 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1633   if (vlen_in_bytes <= 16) {
1634     pxor (dst, dst);
1635     psubb(dst, src);
1636     switch (elem_bt) {
1637       case T_BYTE:   /* nothing to do */ break;
1638       case T_SHORT:  pmovsxbw(dst, dst); break;
1639       case T_INT:    pmovsxbd(dst, dst); break;
1640       case T_FLOAT:  pmovsxbd(dst, dst); break;
1641       case T_LONG:   pmovsxbq(dst, dst); break;
1642       case T_DOUBLE: pmovsxbq(dst, dst); break;
1643 
1644       default: assert(false, "%s", type2name(elem_bt));
1645     }
1646   } else {
1647     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1648     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1649 
1650     vpxor (dst, dst, dst, vlen_enc);
1651     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1652 
1653     switch (elem_bt) {
1654       case T_BYTE:   /* nothing to do */            break;
1655       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1656       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1657       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1658       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1659       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1660 
1661       default: assert(false, "%s", type2name(elem_bt));
1662     }
1663   }
1664 }
1665 
1666 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1667   if (novlbwdq) {
1668     vpmovsxbd(xtmp, src, vlen_enc);
1669     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1670             Assembler::eq, true, vlen_enc, noreg);
1671   } else {
1672     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1673     vpsubb(xtmp, xtmp, src, vlen_enc);
1674     evpmovb2m(dst, xtmp, vlen_enc);
1675   }
1676 }
1677 
1678 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1679   switch (vlen_in_bytes) {
1680     case 4:  movdl(dst, src);   break;
1681     case 8:  movq(dst, src);    break;
1682     case 16: movdqu(dst, src);  break;
1683     case 32: vmovdqu(dst, src); break;
1684     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1685     default: ShouldNotReachHere();
1686   }
1687 }
1688 
1689 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1690   assert(rscratch != noreg || always_reachable(src), "missing");
1691 
1692   if (reachable(src)) {
1693     load_vector(dst, as_Address(src), vlen_in_bytes);
1694   } else {
1695     lea(rscratch, src);
1696     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1697   }
1698 }
1699 
1700 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1701   int vlen_enc = vector_length_encoding(vlen);
1702   if (VM_Version::supports_avx()) {
1703     if (bt == T_LONG) {
1704       if (VM_Version::supports_avx2()) {
1705         vpbroadcastq(dst, src, vlen_enc);
1706       } else {
1707         vmovddup(dst, src, vlen_enc);
1708       }
1709     } else if (bt == T_DOUBLE) {
1710       if (vlen_enc != Assembler::AVX_128bit) {
1711         vbroadcastsd(dst, src, vlen_enc, noreg);
1712       } else {
1713         vmovddup(dst, src, vlen_enc);
1714       }
1715     } else {
1716       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1717         vpbroadcastd(dst, src, vlen_enc);
1718       } else {
1719         vbroadcastss(dst, src, vlen_enc);
1720       }
1721     }
1722   } else if (VM_Version::supports_sse3()) {
1723     movddup(dst, src);
1724   } else {
1725     movq(dst, src);
1726     if (vlen == 16) {
1727       punpcklqdq(dst, dst);
1728     }
1729   }
1730 }
1731 
1732 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1733   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1734   int offset = exact_log2(type2aelembytes(bt)) << 6;
1735   if (is_floating_point_type(bt)) {
1736     offset += 128;
1737   }
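       // e.g. T_INT: exact_log2(4) << 6 = 128; T_FLOAT: 128 + 128 = 256, i.e. the
       // fifth 64-byte slot, right after the T_LONG indices at offset 192.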
1738   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1739   load_vector(dst, addr, vlen_in_bytes);
1740 }
1741 
1742 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1743 
1744 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1745   int vector_len = Assembler::AVX_128bit;
1746 
1747   switch (opcode) {
1748     case Op_AndReductionV:  pand(dst, src); break;
1749     case Op_OrReductionV:   por (dst, src); break;
1750     case Op_XorReductionV:  pxor(dst, src); break;
1751     case Op_MinReductionV:
1752       switch (typ) {
1753         case T_BYTE:        pminsb(dst, src); break;
1754         case T_SHORT:       pminsw(dst, src); break;
1755         case T_INT:         pminsd(dst, src); break;
1756         case T_LONG:        assert(UseAVX > 2, "required");
1757                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1758         default:            assert(false, "wrong type");
1759       }
1760       break;
1761     case Op_MaxReductionV:
1762       switch (typ) {
1763         case T_BYTE:        pmaxsb(dst, src); break;
1764         case T_SHORT:       pmaxsw(dst, src); break;
1765         case T_INT:         pmaxsd(dst, src); break;
1766         case T_LONG:        assert(UseAVX > 2, "required");
1767                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1768         default:            assert(false, "wrong type");
1769       }
1770       break;
1771     case Op_AddReductionVF: addss(dst, src); break;
1772     case Op_AddReductionVD: addsd(dst, src); break;
1773     case Op_AddReductionVI:
1774       switch (typ) {
1775         case T_BYTE:        paddb(dst, src); break;
1776         case T_SHORT:       paddw(dst, src); break;
1777         case T_INT:         paddd(dst, src); break;
1778         default:            assert(false, "wrong type");
1779       }
1780       break;
1781     case Op_AddReductionVL: paddq(dst, src); break;
1782     case Op_MulReductionVF: mulss(dst, src); break;
1783     case Op_MulReductionVD: mulsd(dst, src); break;
1784     case Op_MulReductionVI:
1785       switch (typ) {
1786         case T_SHORT:       pmullw(dst, src); break;
1787         case T_INT:         pmulld(dst, src); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1792                             evpmullq(dst, dst, src, vector_len); break;
1793     default:                assert(false, "wrong opcode");
1794   }
1795 }
1796 
1797 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1798   int vector_len = Assembler::AVX_256bit;
1799 
1800   switch (opcode) {
1801     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1802     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1803     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1804     case Op_MinReductionV:
1805       switch (typ) {
1806         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1807         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1808         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1809         case T_LONG:        assert(UseAVX > 2, "required");
1810                             vpminsq(dst, src1, src2, vector_len); break;
1811         default:            assert(false, "wrong type");
1812       }
1813       break;
1814     case Op_MaxReductionV:
1815       switch (typ) {
1816         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1817         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1818         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1819         case T_LONG:        assert(UseAVX > 2, "required");
1820                             vpmaxsq(dst, src1, src2, vector_len); break;
1821         default:            assert(false, "wrong type");
1822       }
1823       break;
1824     case Op_AddReductionVI:
1825       switch (typ) {
1826         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1827         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1828         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1829         default:            assert(false, "wrong type");
1830       }
1831       break;
1832     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1833     case Op_MulReductionVI:
1834       switch (typ) {
1835         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1836         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1837         default:            assert(false, "wrong type");
1838       }
1839       break;
1840     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1841     default:                assert(false, "wrong opcode");
1842   }
1843 }
1844 
1845 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1846                                   XMMRegister dst, XMMRegister src,
1847                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1848   switch (opcode) {
1849     case Op_AddReductionVF:
1850     case Op_MulReductionVF:
1851       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1852       break;
1853 
1854     case Op_AddReductionVD:
1855     case Op_MulReductionVD:
1856       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1857       break;
1858 
1859     default: assert(false, "wrong opcode");
1860   }
1861 }
1862 
1863 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1864                              Register dst, Register src1, XMMRegister src2,
1865                              XMMRegister vtmp1, XMMRegister vtmp2) {
1866   switch (vlen) {
1867     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1868     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1870     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1871 
1872     default: assert(false, "wrong vector length");
1873   }
1874 }
1875 
1876 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1877                              Register dst, Register src1, XMMRegister src2,
1878                              XMMRegister vtmp1, XMMRegister vtmp2) {
1879   switch (vlen) {
1880     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1881     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1882     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1883     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1884 
1885     default: assert(false, "wrong vector length");
1886   }
1887 }
1888 
1889 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1890                              Register dst, Register src1, XMMRegister src2,
1891                              XMMRegister vtmp1, XMMRegister vtmp2) {
1892   switch (vlen) {
1893     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1894     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1895     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897 
1898     default: assert(false, "wrong vector length");
1899   }
1900 }
1901 
1902 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1903                              Register dst, Register src1, XMMRegister src2,
1904                              XMMRegister vtmp1, XMMRegister vtmp2) {
1905   switch (vlen) {
1906     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1907     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1908     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1909     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1910 
1911     default: assert(false, "wrong vector length");
1912   }
1913 }
1914 
1915 #ifdef _LP64
1916 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1917                              Register dst, Register src1, XMMRegister src2,
1918                              XMMRegister vtmp1, XMMRegister vtmp2) {
1919   switch (vlen) {
1920     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1922     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923 
1924     default: assert(false, "wrong vector length");
1925   }
1926 }
1927 #endif // _LP64
1928 
1929 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1930   switch (vlen) {
1931     case 2:
1932       assert(vtmp2 == xnoreg, "");
1933       reduce2F(opcode, dst, src, vtmp1);
1934       break;
1935     case 4:
1936       assert(vtmp2 == xnoreg, "");
1937       reduce4F(opcode, dst, src, vtmp1);
1938       break;
1939     case 8:
1940       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1941       break;
1942     case 16:
1943       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1944       break;
1945     default: assert(false, "wrong vector length");
1946   }
1947 }
1948 
1949 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1950   switch (vlen) {
1951     case 2:
1952       assert(vtmp2 == xnoreg, "");
1953       reduce2D(opcode, dst, src, vtmp1);
1954       break;
1955     case 4:
1956       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1957       break;
1958     case 8:
1959       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1960       break;
1961     default: assert(false, "wrong vector length");
1962   }
1963 }
1964 
1965 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1966   if (opcode == Op_AddReductionVI) {
1967     if (vtmp1 != src2) {
1968       movdqu(vtmp1, src2);
1969     }
1970     phaddd(vtmp1, vtmp1);
1971   } else {
1972     pshufd(vtmp1, src2, 0x1);
1973     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1974   }
1975   movdl(vtmp2, src1);
1976   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1977   movdl(dst, vtmp1);
1978 }
1979 
1980 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1981   if (opcode == Op_AddReductionVI) {
1982     if (vtmp1 != src2) {
1983       movdqu(vtmp1, src2);
1984     }
1985     phaddd(vtmp1, src2);
1986     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1987   } else {
1988     pshufd(vtmp2, src2, 0xE);
1989     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1990     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1991   }
1992 }
1993 
1994 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   if (opcode == Op_AddReductionVI) {
1996     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1997     vextracti128_high(vtmp2, vtmp1);
1998     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1999     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2000   } else {
2001     vextracti128_high(vtmp1, src2);
2002     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2003     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2004   }
2005 }
2006 
2007 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2008   vextracti64x4_high(vtmp2, src2);
2009   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2010   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2011 }
2012 
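     // Sub-word (byte/short) reductions below repeatedly fold the upper half of the
     // vector into the lower half with shuffles/byte-shifts, then widen the surviving
     // lane and combine it with the scalar input src1.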
2013 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2014   pshufd(vtmp2, src2, 0x1);
2015   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2016   movdqu(vtmp1, vtmp2);
2017   psrldq(vtmp1, 2);
2018   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2019   movdqu(vtmp2, vtmp1);
2020   psrldq(vtmp2, 1);
2021   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2022   movdl(vtmp2, src1);
2023   pmovsxbd(vtmp1, vtmp1);
2024   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2025   pextrb(dst, vtmp1, 0x0);
2026   movsbl(dst, dst);
2027 }
2028 
2029 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2030   pshufd(vtmp1, src2, 0xE);
2031   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2032   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2033 }
2034 
2035 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2036   vextracti128_high(vtmp2, src2);
2037   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2038   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2039 }
2040 
2041 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2042   vextracti64x4_high(vtmp1, src2);
2043   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2044   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2045 }
2046 
2047 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2048   pmovsxbw(vtmp2, src2);
2049   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2050 }
2051 
2052 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2053   if (UseAVX > 1) {
2054     int vector_len = Assembler::AVX_256bit;
2055     vpmovsxbw(vtmp1, src2, vector_len);
2056     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2057   } else {
2058     pmovsxbw(vtmp2, src2);
2059     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2060     pshufd(vtmp2, src2, 0x1);
2061     pmovsxbw(vtmp2, src2);
2062     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2063   }
2064 }
2065 
2066 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2067   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2068     int vector_len = Assembler::AVX_512bit;
2069     vpmovsxbw(vtmp1, src2, vector_len);
2070     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2071   } else {
2072     assert(UseAVX >= 2,"Should not reach here.");
2073     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2074     vextracti128_high(vtmp2, src2);
2075     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2076   }
2077 }
2078 
2079 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2080   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2081   vextracti64x4_high(vtmp2, src2);
2082   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2083 }
2084 
2085 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2086   if (opcode == Op_AddReductionVI) {
2087     if (vtmp1 != src2) {
2088       movdqu(vtmp1, src2);
2089     }
2090     phaddw(vtmp1, vtmp1);
2091     phaddw(vtmp1, vtmp1);
2092   } else {
2093     pshufd(vtmp2, src2, 0x1);
2094     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2095     movdqu(vtmp1, vtmp2);
2096     psrldq(vtmp1, 2);
2097     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2098   }
2099   movdl(vtmp2, src1);
2100   pmovsxwd(vtmp1, vtmp1);
2101   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2102   pextrw(dst, vtmp1, 0x0);
2103   movswl(dst, dst);
2104 }
2105 
2106 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   if (opcode == Op_AddReductionVI) {
2108     if (vtmp1 != src2) {
2109       movdqu(vtmp1, src2);
2110     }
2111     phaddw(vtmp1, src2);
2112   } else {
2113     pshufd(vtmp1, src2, 0xE);
2114     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2115   }
2116   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2117 }
2118 
2119 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2120   if (opcode == Op_AddReductionVI) {
2121     int vector_len = Assembler::AVX_256bit;
2122     vphaddw(vtmp2, src2, src2, vector_len);
2123     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2124   } else {
2125     vextracti128_high(vtmp2, src2);
2126     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2127   }
2128   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2129 }
2130 
2131 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132   int vector_len = Assembler::AVX_256bit;
2133   vextracti64x4_high(vtmp1, src2);
2134   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2135   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2136 }
2137 
2138 #ifdef _LP64
2139 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   pshufd(vtmp2, src2, 0xE);
2141   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2142   movdq(vtmp1, src1);
2143   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2144   movdq(dst, vtmp1);
2145 }
2146 
2147 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2148   vextracti128_high(vtmp1, src2);
2149   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2150   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151 }
2152 
2153 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2154   vextracti64x4_high(vtmp2, src2);
2155   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2156   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2157 }
2158 
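     // Build an opmask with the low 'len' bits set: BZHI clears every bit of -1 at
     // position >= len before the value is moved into the mask register.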
2159 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2160   mov64(temp, -1L);
2161   bzhiq(temp, temp, len);
2162   kmovql(dst, temp);
2163 }
2164 #endif // _LP64
2165 
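     // Float/double reductions fold the source into dst (the accumulator) one lane
     // at a time, using pshufd to bring each successive lane down to element 0 for
     // the scalar add/mul.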
2166 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2167   reduce_operation_128(T_FLOAT, opcode, dst, src);
2168   pshufd(vtmp, src, 0x1);
2169   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2170 }
2171 
2172 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2173   reduce2F(opcode, dst, src, vtmp);
2174   pshufd(vtmp, src, 0x2);
2175   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2176   pshufd(vtmp, src, 0x3);
2177   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2178 }
2179 
2180 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2181   reduce4F(opcode, dst, src, vtmp2);
2182   vextractf128_high(vtmp2, src);
2183   reduce4F(opcode, dst, vtmp2, vtmp1);
2184 }
2185 
2186 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2187   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2188   vextracti64x4_high(vtmp1, src);
2189   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2190 }
2191 
2192 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2193   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2194   pshufd(vtmp, src, 0xE);
2195   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2196 }
2197 
2198 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2199   reduce2D(opcode, dst, src, vtmp2);
2200   vextractf128_high(vtmp2, src);
2201   reduce2D(opcode, dst, vtmp2, vtmp1);
2202 }
2203 
2204 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2205   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2206   vextracti64x4_high(vtmp1, src);
2207   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2208 }
2209 
2210 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2211   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2212 }
2213 
2214 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2215   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2216 }
2217 
2218 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2219                                  int vec_enc) {
2220   switch(elem_bt) {
2221     case T_INT:
2222     case T_FLOAT:
2223       vmaskmovps(dst, src, mask, vec_enc);
2224       break;
2225     case T_LONG:
2226     case T_DOUBLE:
2227       vmaskmovpd(dst, src, mask, vec_enc);
2228       break;
2229     default:
2230       fatal("Unsupported type %s", type2name(elem_bt));
2231       break;
2232   }
2233 }
2234 
2235 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2236                                  int vec_enc) {
2237   switch(elem_bt) {
2238     case T_INT:
2239     case T_FLOAT:
2240       vmaskmovps(dst, src, mask, vec_enc);
2241       break;
2242     case T_LONG:
2243     case T_DOUBLE:
2244       vmaskmovpd(dst, src, mask, vec_enc);
2245       break;
2246     default:
2247       fatal("Unsupported type %s", type2name(elem_bt));
2248       break;
2249   }
2250 }
2251 
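     // Min/Max reduction for float vectors: each loop iteration combines the upper
     // half of the working vector (extracted, or permuted into the low lanes for the
     // 128-bit steps) with the lower half via vminmax_fp, halving the width until a
     // single lane remains; an optional accumulator in dst is folded in at the end.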
2252 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2253                                           XMMRegister dst, XMMRegister src,
2254                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2255                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2256   const int permconst[] = {1, 14};
2257   XMMRegister wsrc = src;
2258   XMMRegister wdst = xmm_0;
2259   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2260 
2261   int vlen_enc = Assembler::AVX_128bit;
2262   if (vlen == 16) {
2263     vlen_enc = Assembler::AVX_256bit;
2264   }
2265 
2266   for (int i = log2(vlen) - 1; i >=0; i--) {
2267     if (i == 0 && !is_dst_valid) {
2268       wdst = dst;
2269     }
2270     if (i == 3) {
2271       vextracti64x4_high(wtmp, wsrc);
2272     } else if (i == 2) {
2273       vextracti128_high(wtmp, wsrc);
2274     } else { // i = [0,1]
2275       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2276     }
2277     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2278     wsrc = wdst;
2279     vlen_enc = Assembler::AVX_128bit;
2280   }
2281   if (is_dst_valid) {
2282     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2283   }
2284 }
2285 
2286 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2287                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2288                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2289   XMMRegister wsrc = src;
2290   XMMRegister wdst = xmm_0;
2291   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2292   int vlen_enc = Assembler::AVX_128bit;
2293   if (vlen == 8) {
2294     vlen_enc = Assembler::AVX_256bit;
2295   }
2296   for (int i = log2(vlen) - 1; i >=0; i--) {
2297     if (i == 0 && !is_dst_valid) {
2298       wdst = dst;
2299     }
2300     if (i == 1) {
2301       vextracti128_high(wtmp, wsrc);
2302     } else if (i == 2) {
2303       vextracti64x4_high(wtmp, wsrc);
2304     } else {
2305       assert(i == 0, "%d", i);
2306       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2307     }
2308     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2309     wsrc = wdst;
2310     vlen_enc = Assembler::AVX_128bit;
2311   }
2312   if (is_dst_valid) {
2313     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2314   }
2315 }
2316 
2317 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2318   switch (bt) {
2319     case T_BYTE:  pextrb(dst, src, idx); break;
2320     case T_SHORT: pextrw(dst, src, idx); break;
2321     case T_INT:   pextrd(dst, src, idx); break;
2322     case T_LONG:  pextrq(dst, src, idx); break;
2323 
2324     default:
2325       assert(false,"Should not reach here.");
2326       break;
2327   }
2328 }
2329 
2330 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2331   int esize =  type2aelembytes(typ);
2332   int elem_per_lane = 16/esize;
2333   int lane = elemindex / elem_per_lane;
2334   int eindex = elemindex % elem_per_lane;
2335 
2336   if (lane >= 2) {
2337     assert(UseAVX > 2, "required");
2338     vextractf32x4(dst, src, lane & 3);
2339     return dst;
2340   } else if (lane > 0) {
2341     assert(UseAVX > 0, "required");
2342     vextractf128(dst, src, lane);
2343     return dst;
2344   } else {
2345     return src;
2346   }
2347 }
2348 
2349 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2350   if (typ == T_BYTE) {
2351     movsbl(dst, dst);
2352   } else if (typ == T_SHORT) {
2353     movswl(dst, dst);
2354   }
2355 }
2356 
2357 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2358   int esize =  type2aelembytes(typ);
2359   int elem_per_lane = 16/esize;
2360   int eindex = elemindex % elem_per_lane;
2361   assert(is_integral_type(typ),"required");
2362 
2363   if (eindex == 0) {
2364     if (typ == T_LONG) {
2365       movq(dst, src);
2366     } else {
2367       movdl(dst, src);
2368       movsxl(typ, dst);
2369     }
2370   } else {
2371     extract(typ, dst, src, eindex);
2372     movsxl(typ, dst);
2373   }
2374 }
2375 
2376 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2377   int esize =  type2aelembytes(typ);
2378   int elem_per_lane = 16/esize;
2379   int eindex = elemindex % elem_per_lane;
2380   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2381 
2382   if (eindex == 0) {
2383     movq(dst, src);
2384   } else {
2385     if (typ == T_FLOAT) {
2386       if (UseAVX == 0) {
2387         movdqu(dst, src);
2388         shufps(dst, dst, eindex);
2389       } else {
2390         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2391       }
2392     } else {
2393       if (UseAVX == 0) {
2394         movdqu(dst, src);
2395         psrldq(dst, eindex*esize);
2396       } else {
2397         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2398       }
2399       movq(dst, dst);
2400     }
2401   }
2402   // Zero upper bits
2403   if (typ == T_FLOAT) {
2404     if (UseAVX == 0) {
2405       assert(vtmp != xnoreg, "required.");
2406       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2407       pand(dst, vtmp);
2408     } else {
2409       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2410     }
2411   }
2412 }
2413 
2414 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2415   switch(typ) {
2416     case T_BYTE:
2417     case T_BOOLEAN:
2418       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2419       break;
2420     case T_SHORT:
2421     case T_CHAR:
2422       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2423       break;
2424     case T_INT:
2425     case T_FLOAT:
2426       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2427       break;
2428     case T_LONG:
2429     case T_DOUBLE:
2430       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2431       break;
2432     default:
2433       assert(false,"Should not reach here.");
2434       break;
2435   }
2436 }
2437 
2438 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2439   assert(rscratch != noreg || always_reachable(src2), "missing");
2440 
2441   switch(typ) {
2442     case T_BOOLEAN:
2443     case T_BYTE:
2444       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2445       break;
2446     case T_CHAR:
2447     case T_SHORT:
2448       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2449       break;
2450     case T_INT:
2451     case T_FLOAT:
2452       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2453       break;
2454     case T_LONG:
2455     case T_DOUBLE:
2456       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2457       break;
2458     default:
2459       assert(false,"Should not reach here.");
2460       break;
2461   }
2462 }
2463 
2464 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2465   switch(typ) {
2466     case T_BYTE:
2467       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2468       break;
2469     case T_SHORT:
2470       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2471       break;
2472     case T_INT:
2473     case T_FLOAT:
2474       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2475       break;
2476     case T_LONG:
2477     case T_DOUBLE:
2478       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2479       break;
2480     default:
2481       assert(false,"Should not reach here.");
2482       break;
2483   }
2484 }
2485 
2486 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2487   assert(vlen_in_bytes <= 32, "");
2488   int esize = type2aelembytes(bt);
2489   if (vlen_in_bytes == 32) {
2490     assert(vtmp == xnoreg, "required.");
2491     if (esize >= 4) {
2492       vtestps(src1, src2, AVX_256bit);
2493     } else {
2494       vptest(src1, src2, AVX_256bit);
2495     }
2496     return;
2497   }
2498   if (vlen_in_bytes < 16) {
2499     // Duplicate the lower part to fill the whole register;
2500     // there is no need to do so for src2.
2501     assert(vtmp != xnoreg, "required");
2502     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2503     pshufd(vtmp, src1, shuffle_imm);
2504   } else {
2505     assert(vtmp == xnoreg, "required");
2506     vtmp = src1;
2507   }
2508   if (esize >= 4 && VM_Version::supports_avx()) {
2509     vtestps(vtmp, src2, AVX_128bit);
2510   } else {
2511     ptest(vtmp, src2);
2512   }
2513 }
2514 
2515 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2516   assert(UseAVX >= 2, "required");
2517 #ifdef ASSERT
2518   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2519   bool is_bw_supported = VM_Version::supports_avx512bw();
2520   if (is_bw && !is_bw_supported) {
2521     assert(vlen_enc != Assembler::AVX_512bit, "required");
2522     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2523            "XMM register should be 0-15");
2524   }
2525 #endif // ASSERT
2526   switch (elem_bt) {
2527     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2528     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2529     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2530     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2531     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2532     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2533     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2534   }
2535 }
2536 
2537 #ifdef _LP64
2538 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2539   assert(UseAVX >= 2, "required");
2540   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2541   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2542   if ((UseAVX > 2) &&
2543       (!is_bw || VM_Version::supports_avx512bw()) &&
2544       (!is_vl || VM_Version::supports_avx512vl())) {
2545     switch (elem_bt) {
2546       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2547       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2548       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2549       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2550       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2551     }
2552   } else {
2553     assert(vlen_enc != Assembler::AVX_512bit, "required");
2554     assert((dst->encoding() < 16),"XMM register should be 0-15");
2555     switch (elem_bt) {
2556       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2557       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2558       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2559       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2560       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2561       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2562       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2563     }
2564   }
2565 }
2566 #endif
2567 
2568 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2569   switch (to_elem_bt) {
2570     case T_SHORT:
2571       vpmovsxbw(dst, src, vlen_enc);
2572       break;
2573     case T_INT:
2574       vpmovsxbd(dst, src, vlen_enc);
2575       break;
2576     case T_FLOAT:
2577       vpmovsxbd(dst, src, vlen_enc);
2578       vcvtdq2ps(dst, dst, vlen_enc);
2579       break;
2580     case T_LONG:
2581       vpmovsxbq(dst, src, vlen_enc);
2582       break;
2583     case T_DOUBLE: {
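           // vcvtdq2pd reads an int source and doubles the element size, so first
           // sign-extend the bytes to ints using a narrower vector encoding.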
2584       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2585       vpmovsxbd(dst, src, mid_vlen_enc);
2586       vcvtdq2pd(dst, dst, vlen_enc);
2587       break;
2588     }
2589     default:
2590       fatal("Unsupported type %s", type2name(to_elem_bt));
2591       break;
2592   }
2593 }
2594 
2595 //-------------------------------------------------------------------------------------------
2596 
2597 // IndexOf for constant substrings with size >= 8 chars
2598 // which don't need to be loaded through stack.
2599 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2600                                          Register cnt1, Register cnt2,
2601                                          int int_cnt2,  Register result,
2602                                          XMMRegister vec, Register tmp,
2603                                          int ae) {
2604   ShortBranchVerifier sbv(this);
2605   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2606   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2607 
2608   // This method uses the pcmpestri instruction with bound registers
2609   //   inputs:
2610   //     xmm - substring
2611   //     rax - substring length (elements count)
2612   //     mem - scanned string
2613   //     rdx - string length (elements count)
2614   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2615   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2616   //   outputs:
2617   //     rcx - matched index in string
2618   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2619   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2620   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2621   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2622   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
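       // scale1 is the element size of the scanned string, scale2 that of the
       // substring (one byte when the substring is Latin-1, i.e. the UL case).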
2623 
2624   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2625         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2626         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2627 
2628   // Note, inline_string_indexOf() generates checks:
2629   // if (substr.count > string.count) return -1;
2630   // if (substr.count == 0) return 0;
2631   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2632 
2633   // Load substring.
2634   if (ae == StrIntrinsicNode::UL) {
2635     pmovzxbw(vec, Address(str2, 0));
2636   } else {
2637     movdqu(vec, Address(str2, 0));
2638   }
2639   movl(cnt2, int_cnt2);
2640   movptr(result, str1); // string addr
2641 
2642   if (int_cnt2 > stride) {
2643     jmpb(SCAN_TO_SUBSTR);
2644 
2645     // Reload substr for rescan; this code
2646     // is executed only for large substrings (> 8 chars).
2647     bind(RELOAD_SUBSTR);
2648     if (ae == StrIntrinsicNode::UL) {
2649       pmovzxbw(vec, Address(str2, 0));
2650     } else {
2651       movdqu(vec, Address(str2, 0));
2652     }
2653     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2654 
2655     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2663     subl(cnt1, cnt2);
2664     addl(cnt1, int_cnt2);
2665     movl(cnt2, int_cnt2); // Now restore cnt2
2666 
2667     decrementl(cnt1);     // Shift to next element
2668     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2670 
2671     addptr(result, (1<<scale1));
2672 
2673   } // (int_cnt2 > 8)
2674 
2675   // Scan string for start of substr in 16-byte vectors
2676   bind(SCAN_TO_SUBSTR);
2677   pcmpestri(vec, Address(result, 0), mode);
2678   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2679   subl(cnt1, stride);
2680   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2681   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2683   addptr(result, 16);
2684   jmpb(SCAN_TO_SUBSTR);
2685 
2686   // Found a potential substr
2687   bind(FOUND_CANDIDATE);
2688   // Matched whole vector if first element matched (tmp(rcx) == 0).
2689   if (int_cnt2 == stride) {
2690     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2691   } else { // int_cnt2 > 8
2692     jccb(Assembler::overflow, FOUND_SUBSTR);
2693   }
2694   // After pcmpestri tmp(rcx) contains matched element index
2695   // Compute start addr of substr
2696   lea(result, Address(result, tmp, scale1));
2697 
2698   // Make sure string is still long enough
2699   subl(cnt1, tmp);
2700   cmpl(cnt1, cnt2);
2701   if (int_cnt2 == stride) {
2702     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2703   } else { // int_cnt2 > 8
2704     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2705   }
  // Left less than substring.
2707 
2708   bind(RET_NOT_FOUND);
2709   movl(result, -1);
2710   jmp(EXIT);
2711 
2712   if (int_cnt2 > stride) {
2713     // This code is optimized for the case when whole substring
2714     // is matched if its head is matched.
2715     bind(MATCH_SUBSTR_HEAD);
2716     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2718     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2719 
2720     Label CONT_SCAN_SUBSTR;
2721     // Compare the rest of substring (> 8 chars).
2722     bind(FOUND_SUBSTR);
2723     // First 8 chars are already matched.
2724     negptr(cnt2);
2725     addptr(cnt2, stride);
2726 
2727     bind(SCAN_SUBSTR);
2728     subl(cnt1, stride);
2729     cmpl(cnt2, -stride); // Do not read beyond substring
2730     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2731     // Back-up strings to avoid reading beyond substring:
2732     // cnt1 = cnt1 - cnt2 + 8
2733     addl(cnt1, cnt2); // cnt2 is negative
2734     addl(cnt1, stride);
2735     movl(cnt2, stride); negptr(cnt2);
2736     bind(CONT_SCAN_SUBSTR);
2737     if (int_cnt2 < (int)G) {
2738       int tail_off1 = int_cnt2<<scale1;
2739       int tail_off2 = int_cnt2<<scale2;
2740       if (ae == StrIntrinsicNode::UL) {
2741         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2742       } else {
2743         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2744       }
2745       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2746     } else {
2747       // calculate index in register to avoid integer overflow (int_cnt2*2)
2748       movl(tmp, int_cnt2);
2749       addptr(tmp, cnt2);
2750       if (ae == StrIntrinsicNode::UL) {
2751         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2752       } else {
2753         movdqu(vec, Address(str2, tmp, scale2, 0));
2754       }
2755       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2756     }
    // Need to reload string pointers if we did not match the whole vector
2758     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2759     addptr(cnt2, stride);
2760     jcc(Assembler::negative, SCAN_SUBSTR);
2761     // Fall through if found full substring
2762 
2763   } // (int_cnt2 > 8)
2764 
2765   bind(RET_FOUND);
2766   // Found result if we matched full small substring.
2767   // Compute substr offset
2768   subptr(result, str1);
2769   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2770     shrl(result, 1); // index
2771   }
2772   bind(EXIT);
2773 
2774 } // string_indexofC8
2775 
// Small strings are loaded through the stack if they cross a page boundary.
2777 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2778                                        Register cnt1, Register cnt2,
2779                                        int int_cnt2,  Register result,
2780                                        XMMRegister vec, Register tmp,
2781                                        int ae) {
2782   ShortBranchVerifier sbv(this);
2783   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2784   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2785 
2786   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
2790   //
2791   // Note, inline_string_indexOf() generates checks:
2792   // if (substr.count > string.count) return -1;
2793   // if (substr.count == 0) return 0;
2794   //
2795   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2796   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2797   // This method uses the pcmpestri instruction with bound registers
2798   //   inputs:
2799   //     xmm - substring
2800   //     rax - substring length (elements count)
2801   //     mem - scanned string
2802   //     rdx - string length (elements count)
2803   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2804   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2805   //   outputs:
2806   //     rcx - matched index in string
2807   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2808   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2809   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2810   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2811 
2812   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2813         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2814         FOUND_CANDIDATE;
2815 
2816   { //========================================================
2817     // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
2819     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2820 
2821     movptr(tmp, rsp); // save old SP
2822 
2823     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2824       if (int_cnt2 == (1>>scale2)) { // One byte
2825         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2826         load_unsigned_byte(result, Address(str2, 0));
2827         movdl(vec, result); // move 32 bits
2828       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2829         // Not enough header space in 32-bit VM: 12+3 = 15.
2830         movl(result, Address(str2, -1));
2831         shrl(result, 8);
2832         movdl(vec, result); // move 32 bits
2833       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2834         load_unsigned_short(result, Address(str2, 0));
2835         movdl(vec, result); // move 32 bits
2836       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2837         movdl(vec, Address(str2, 0)); // move 32 bits
2838       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2839         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = {3, 5, 6, 7} || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2841         // Array header size is 12 bytes in 32-bit VM
2842         // + 6 bytes for 3 chars == 18 bytes,
2843         // enough space to load vec and shift.
2844         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2845         if (ae == StrIntrinsicNode::UL) {
2846           int tail_off = int_cnt2-8;
2847           pmovzxbw(vec, Address(str2, tail_off));
2848           psrldq(vec, -2*tail_off);
2849         }
2850         else {
2851           int tail_off = int_cnt2*(1<<scale2);
2852           movdqu(vec, Address(str2, tail_off-16));
2853           psrldq(vec, 16-tail_off);
2854         }
2855       }
2856     } else { // not constant substring
2857       cmpl(cnt2, stride);
2858       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2859 
      // We can read beyond the string if str+16 does not cross a page boundary
      // since heaps are aligned and mapped by pages.
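      // Sketch of the check below (illustrative only):
      //   boolean canRead16 = (addr & (pageSize - 1)) <= pageSize - 16;
      // i.e. a full 16-byte load starting at addr stays within its page.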
2862       assert(os::vm_page_size() < (int)G, "default page should be small");
2863       movl(result, str2); // We need only low 32 bits
2864       andl(result, ((int)os::vm_page_size()-1));
2865       cmpl(result, ((int)os::vm_page_size()-16));
2866       jccb(Assembler::belowEqual, CHECK_STR);
2867 
      // Move small strings to the stack to allow loading 16 bytes into vec.
2869       subptr(rsp, 16);
2870       int stk_offset = wordSize-(1<<scale2);
2871       push(cnt2);
2872 
2873       bind(COPY_SUBSTR);
2874       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2875         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2876         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2877       } else if (ae == StrIntrinsicNode::UU) {
2878         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2879         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2880       }
2881       decrement(cnt2);
2882       jccb(Assembler::notZero, COPY_SUBSTR);
2883 
2884       pop(cnt2);
2885       movptr(str2, rsp);  // New substring address
2886     } // non constant
2887 
2888     bind(CHECK_STR);
2889     cmpl(cnt1, stride);
2890     jccb(Assembler::aboveEqual, BIG_STRINGS);
2891 
2892     // Check cross page boundary.
2893     movl(result, str1); // We need only low 32 bits
2894     andl(result, ((int)os::vm_page_size()-1));
2895     cmpl(result, ((int)os::vm_page_size()-16));
2896     jccb(Assembler::belowEqual, BIG_STRINGS);
2897 
2898     subptr(rsp, 16);
2899     int stk_offset = -(1<<scale1);
2900     if (int_cnt2 < 0) { // not constant
2901       push(cnt2);
2902       stk_offset += wordSize;
2903     }
2904     movl(cnt2, cnt1);
2905 
2906     bind(COPY_STR);
2907     if (ae == StrIntrinsicNode::LL) {
2908       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2909       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2910     } else {
2911       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2912       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2913     }
2914     decrement(cnt2);
2915     jccb(Assembler::notZero, COPY_STR);
2916 
2917     if (int_cnt2 < 0) { // not constant
2918       pop(cnt2);
2919     }
2920     movptr(str1, rsp);  // New string address
2921 
2922     bind(BIG_STRINGS);
2923     // Load substring.
2924     if (int_cnt2 < 0) { // -1
2925       if (ae == StrIntrinsicNode::UL) {
2926         pmovzxbw(vec, Address(str2, 0));
2927       } else {
2928         movdqu(vec, Address(str2, 0));
2929       }
2930       push(cnt2);       // substr count
2931       push(str2);       // substr addr
2932       push(str1);       // string addr
2933     } else {
2934       // Small (< 8 chars) constant substrings are loaded already.
2935       movl(cnt2, int_cnt2);
2936     }
2937     push(tmp);  // original SP
2938 
2939   } // Finished loading
2940 
2941   //========================================================
2942   // Start search
2943   //
2944 
2945   movptr(result, str1); // string addr
2946 
2947   if (int_cnt2  < 0) {  // Only for non constant substring
2948     jmpb(SCAN_TO_SUBSTR);
2949 
2950     // SP saved at sp+0
2951     // String saved at sp+1*wordSize
2952     // Substr saved at sp+2*wordSize
2953     // Substr count saved at sp+3*wordSize
2954 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars)
2957     bind(RELOAD_SUBSTR);
2958     movptr(str2, Address(rsp, 2*wordSize));
2959     movl(cnt2, Address(rsp, 3*wordSize));
2960     if (ae == StrIntrinsicNode::UL) {
2961       pmovzxbw(vec, Address(str2, 0));
2962     } else {
2963       movdqu(vec, Address(str2, 0));
2964     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2968     subptr(str1, result); // Restore counter
2969     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2970       shrl(str1, 1);
2971     }
2972     addl(cnt1, str1);
2973     decrementl(cnt1);   // Shift to next element
2974     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2976 
2977     addptr(result, (1<<scale1));
2978   } // non constant
2979 
2980   // Scan string for start of substr in 16-byte vectors
2981   bind(SCAN_TO_SUBSTR);
2982   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2983   pcmpestri(vec, Address(result, 0), mode);
2984   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2985   subl(cnt1, stride);
2986   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2987   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2989   addptr(result, 16);
2990 
2991   bind(ADJUST_STR);
2992   cmpl(cnt1, stride); // Do not read beyond string
2993   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2994   // Back-up string to avoid reading beyond string.
2995   lea(result, Address(result, cnt1, scale1, -16));
2996   movl(cnt1, stride);
2997   jmpb(SCAN_TO_SUBSTR);
2998 
2999   // Found a potential substr
3000   bind(FOUND_CANDIDATE);
3001   // After pcmpestri tmp(rcx) contains matched element index
3002 
3003   // Make sure string is still long enough
3004   subl(cnt1, tmp);
3005   cmpl(cnt1, cnt2);
3006   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3008 
3009   bind(RET_NOT_FOUND);
3010   movl(result, -1);
3011   jmp(CLEANUP);
3012 
3013   bind(FOUND_SUBSTR);
3014   // Compute start addr of substr
3015   lea(result, Address(result, tmp, scale1));
3016   if (int_cnt2 > 0) { // Constant substring
3017     // Repeat search for small substring (< 8 chars)
3018     // from new point without reloading substring.
3019     // Have to check that we don't read beyond string.
3020     cmpl(tmp, stride-int_cnt2);
3021     jccb(Assembler::greater, ADJUST_STR);
3022     // Fall through if matched whole substring.
3023   } else { // non constant
3024     assert(int_cnt2 == -1, "should be != 0");
3025 
3026     addl(tmp, cnt2);
3027     // Found result if we matched whole substring.
3028     cmpl(tmp, stride);
3029     jcc(Assembler::lessEqual, RET_FOUND);
3030 
3031     // Repeat search for small substring (<= 8 chars)
3032     // from new point 'str1' without reloading substring.
3033     cmpl(cnt2, stride);
3034     // Have to check that we don't read beyond string.
3035     jccb(Assembler::lessEqual, ADJUST_STR);
3036 
3037     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3038     // Compare the rest of substring (> 8 chars).
3039     movptr(str1, result);
3040 
3041     cmpl(tmp, cnt2);
3042     // First 8 chars are already matched.
3043     jccb(Assembler::equal, CHECK_NEXT);
3044 
3045     bind(SCAN_SUBSTR);
3046     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
3048     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3049 
3050     bind(CHECK_NEXT);
3051     subl(cnt2, stride);
3052     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3053     addptr(str1, 16);
3054     if (ae == StrIntrinsicNode::UL) {
3055       addptr(str2, 8);
3056     } else {
3057       addptr(str2, 16);
3058     }
3059     subl(cnt1, stride);
3060     cmpl(cnt2, stride); // Do not read beyond substring
3061     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3062     // Back-up strings to avoid reading beyond substring.
3063 
3064     if (ae == StrIntrinsicNode::UL) {
3065       lea(str2, Address(str2, cnt2, scale2, -8));
3066       lea(str1, Address(str1, cnt2, scale1, -16));
3067     } else {
3068       lea(str2, Address(str2, cnt2, scale2, -16));
3069       lea(str1, Address(str1, cnt2, scale1, -16));
3070     }
3071     subl(cnt1, cnt2);
3072     movl(cnt2, stride);
3073     addl(cnt1, stride);
3074     bind(CONT_SCAN_SUBSTR);
3075     if (ae == StrIntrinsicNode::UL) {
3076       pmovzxbw(vec, Address(str2, 0));
3077     } else {
3078       movdqu(vec, Address(str2, 0));
3079     }
3080     jmp(SCAN_SUBSTR);
3081 
3082     bind(RET_FOUND_LONG);
3083     movptr(str1, Address(rsp, wordSize));
3084   } // non constant
3085 
3086   bind(RET_FOUND);
3087   // Compute substr offset
3088   subptr(result, str1);
3089   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3090     shrl(result, 1); // index
3091   }
3092   bind(CLEANUP);
3093   pop(rsp); // restore SP
3094 
3095 } // string_indexof
3096 
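// Search for a single UTF-16 char in a char array and return the index of its
// first occurrence, or -1 if absent. Roughly (illustrative sketch only, not
// actual library code):
//
//   static int indexOfChar(char[] value, int count, int ch) {
//     for (int i = 0; i < count; i++) {
//       if (value[i] == ch) return i;
//     }
//     return -1;
//   }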
3097 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3098                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3099   ShortBranchVerifier sbv(this);
3100   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3101 
3102   int stride = 8;
3103 
3104   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3105         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3106         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3107         FOUND_SEQ_CHAR, DONE_LABEL;
3108 
3109   movptr(result, str1);
3110   if (UseAVX >= 2) {
3111     cmpl(cnt1, stride);
3112     jcc(Assembler::less, SCAN_TO_CHAR);
3113     cmpl(cnt1, 2*stride);
3114     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3115     movdl(vec1, ch);
3116     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3117     vpxor(vec2, vec2);
3118     movl(tmp, cnt1);
3119     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3120     andl(cnt1,0x0000000F);  //tail count (in chars)
3121 
3122     bind(SCAN_TO_16_CHAR_LOOP);
3123     vmovdqu(vec3, Address(result, 0));
3124     vpcmpeqw(vec3, vec3, vec1, 1);
3125     vptest(vec2, vec3);
3126     jcc(Assembler::carryClear, FOUND_CHAR);
3127     addptr(result, 32);
3128     subl(tmp, 2*stride);
3129     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3130     jmp(SCAN_TO_8_CHAR);
3131     bind(SCAN_TO_8_CHAR_INIT);
3132     movdl(vec1, ch);
3133     pshuflw(vec1, vec1, 0x00);
3134     pshufd(vec1, vec1, 0);
3135     pxor(vec2, vec2);
3136   }
3137   bind(SCAN_TO_8_CHAR);
3138   cmpl(cnt1, stride);
3139   jcc(Assembler::less, SCAN_TO_CHAR);
3140   if (UseAVX < 2) {
3141     movdl(vec1, ch);
3142     pshuflw(vec1, vec1, 0x00);
3143     pshufd(vec1, vec1, 0);
3144     pxor(vec2, vec2);
3145   }
3146   movl(tmp, cnt1);
3147   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3148   andl(cnt1,0x00000007);  //tail count (in chars)
3149 
3150   bind(SCAN_TO_8_CHAR_LOOP);
3151   movdqu(vec3, Address(result, 0));
3152   pcmpeqw(vec3, vec1);
3153   ptest(vec2, vec3);
3154   jcc(Assembler::carryClear, FOUND_CHAR);
3155   addptr(result, 16);
3156   subl(tmp, stride);
3157   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3158   bind(SCAN_TO_CHAR);
3159   testl(cnt1, cnt1);
3160   jcc(Assembler::zero, RET_NOT_FOUND);
3161   bind(SCAN_TO_CHAR_LOOP);
3162   load_unsigned_short(tmp, Address(result, 0));
3163   cmpl(ch, tmp);
3164   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3165   addptr(result, 2);
3166   subl(cnt1, 1);
3167   jccb(Assembler::zero, RET_NOT_FOUND);
3168   jmp(SCAN_TO_CHAR_LOOP);
3169 
3170   bind(RET_NOT_FOUND);
3171   movl(result, -1);
3172   jmpb(DONE_LABEL);
3173 
3174   bind(FOUND_CHAR);
3175   if (UseAVX >= 2) {
3176     vpmovmskb(tmp, vec3);
3177   } else {
3178     pmovmskb(tmp, vec3);
3179   }
3180   bsfl(ch, tmp);
3181   addptr(result, ch);
3182 
3183   bind(FOUND_SEQ_CHAR);
3184   subptr(result, str1);
3185   shrl(result, 1);
3186 
3187   bind(DONE_LABEL);
3188 } // string_indexof_char
3189 
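// Latin-1 (byte[]) variant of string_indexof_char above: same structure, but
// elements are single bytes, so 32/16 bytes are scanned per vector step and
// the returned index is a byte index (no final shift by 1).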
3190 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3191                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3192   ShortBranchVerifier sbv(this);
3193   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3194 
3195   int stride = 16;
3196 
3197   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3198         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3199         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3200         FOUND_SEQ_CHAR, DONE_LABEL;
3201 
3202   movptr(result, str1);
3203   if (UseAVX >= 2) {
3204     cmpl(cnt1, stride);
3205     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3206     cmpl(cnt1, stride*2);
3207     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3208     movdl(vec1, ch);
3209     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3210     vpxor(vec2, vec2);
3211     movl(tmp, cnt1);
3212     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3213     andl(cnt1,0x0000001F);  //tail count (in chars)
3214 
3215     bind(SCAN_TO_32_CHAR_LOOP);
3216     vmovdqu(vec3, Address(result, 0));
3217     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3218     vptest(vec2, vec3);
3219     jcc(Assembler::carryClear, FOUND_CHAR);
3220     addptr(result, 32);
3221     subl(tmp, stride*2);
3222     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3223     jmp(SCAN_TO_16_CHAR);
3224 
3225     bind(SCAN_TO_16_CHAR_INIT);
3226     movdl(vec1, ch);
3227     pxor(vec2, vec2);
3228     pshufb(vec1, vec2);
3229   }
3230 
3231   bind(SCAN_TO_16_CHAR);
3232   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3234   if (UseAVX < 2) {
3235     movdl(vec1, ch);
3236     pxor(vec2, vec2);
3237     pshufb(vec1, vec2);
3238   }
3239   movl(tmp, cnt1);
3240   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3241   andl(cnt1,0x0000000F);  //tail count (in bytes)
3242 
3243   bind(SCAN_TO_16_CHAR_LOOP);
3244   movdqu(vec3, Address(result, 0));
3245   pcmpeqb(vec3, vec1);
3246   ptest(vec2, vec3);
3247   jcc(Assembler::carryClear, FOUND_CHAR);
3248   addptr(result, 16);
3249   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3251 
3252   bind(SCAN_TO_CHAR_INIT);
3253   testl(cnt1, cnt1);
3254   jcc(Assembler::zero, RET_NOT_FOUND);
3255   bind(SCAN_TO_CHAR_LOOP);
3256   load_unsigned_byte(tmp, Address(result, 0));
3257   cmpl(ch, tmp);
3258   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3259   addptr(result, 1);
3260   subl(cnt1, 1);
3261   jccb(Assembler::zero, RET_NOT_FOUND);
3262   jmp(SCAN_TO_CHAR_LOOP);
3263 
3264   bind(RET_NOT_FOUND);
3265   movl(result, -1);
3266   jmpb(DONE_LABEL);
3267 
3268   bind(FOUND_CHAR);
3269   if (UseAVX >= 2) {
3270     vpmovmskb(tmp, vec3);
3271   } else {
3272     pmovmskb(tmp, vec3);
3273   }
3274   bsfl(ch, tmp);
3275   addptr(result, ch);
3276 
3277   bind(FOUND_SEQ_CHAR);
3278   subptr(result, str1);
3279 
3280   bind(DONE_LABEL);
3281 } // stringL_indexof_char
3282 
3283 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3284   switch (eltype) {
3285   case T_BOOLEAN: return sizeof(jboolean);
3286   case T_BYTE:  return sizeof(jbyte);
3287   case T_SHORT: return sizeof(jshort);
3288   case T_CHAR:  return sizeof(jchar);
3289   case T_INT:   return sizeof(jint);
3290   default:
3291     ShouldNotReachHere();
3292     return -1;
3293   }
3294 }
3295 
3296 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3297   switch (eltype) {
3298   // T_BOOLEAN used as surrogate for unsigned byte
3299   case T_BOOLEAN: movzbl(dst, src);   break;
3300   case T_BYTE:    movsbl(dst, src);   break;
3301   case T_SHORT:   movswl(dst, src);   break;
3302   case T_CHAR:    movzwl(dst, src);   break;
3303   case T_INT:     movl(dst, src);     break;
3304   default:
3305     ShouldNotReachHere();
3306   }
3307 }
3308 
3309 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3310   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3311 }
3312 
3313 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3314   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3315 }
3316 
3317 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3318   const int vlen = Assembler::AVX_256bit;
3319   switch (eltype) {
3320   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3321   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3322   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3323   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3324   case T_INT:
3325     // do nothing
3326     break;
3327   default:
3328     ShouldNotReachHere();
3329   }
3330 }
3331 
3332 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3333                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3334                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3335                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3336                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3337                                         BasicType eltype) {
3338   ShortBranchVerifier sbv(this);
3339   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3340   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3341   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3342 
3343   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3344         SHORT_UNROLLED_LOOP_EXIT,
3345         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3346         UNROLLED_VECTOR_LOOP_BEGIN,
3347         END;
3348   switch (eltype) {
3349   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3350   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3351   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3352   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3353   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3354   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3355   }
3356 
  // Register aliases ("renaming") for readability of the code
3358   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3359                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3360                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3361 
3362   const int elsize = arrays_hashcode_elsize(eltype);
3363 
3364   /*
3365     if (cnt1 >= 2) {
3366       if (cnt1 >= 32) {
3367         UNROLLED VECTOR LOOP
3368       }
3369       UNROLLED SCALAR LOOP
3370     }
3371     SINGLE SCALAR
3372    */
3373 
3374   cmpl(cnt1, 32);
3375   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3376 
3377   // cnt1 >= 32 && generate_vectorized_loop
3378   xorl(index, index);
3379 
3380   // vresult = IntVector.zero(I256);
3381   for (int idx = 0; idx < 4; idx++) {
3382     vpxor(vresult[idx], vresult[idx]);
3383   }
3384   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3385   Register bound = tmp2;
3386   Register next = tmp3;
3387   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3388   movl(next, Address(tmp2, 0));
3389   movdl(vnext, next);
3390   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3391 
3392   // index = 0;
3393   // bound = cnt1 & ~(32 - 1);
3394   movl(bound, cnt1);
3395   andl(bound, ~(32 - 1));
3396   // for (; index < bound; index += 32) {
3397   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3398   // result *= next;
3399   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching
3402   for (int idx = 0; idx < 4; idx++) {
3403     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3404   }
3405   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3406   for (int idx = 0; idx < 4; idx++) {
3407     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3408     arrays_hashcode_elvcast(vtmp[idx], eltype);
3409     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3410   }
3411   // index += 32;
3412   addl(index, 32);
3413   // index < bound;
3414   cmpl(index, bound);
3415   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3416   // }
3417 
3418   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3419   subl(cnt1, bound);
3420   // release bound
3421 
3422   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3423   for (int idx = 0; idx < 4; idx++) {
3424     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3425     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3426     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3427   }
3428   // result += vresult.reduceLanes(ADD);
3429   for (int idx = 0; idx < 4; idx++) {
3430     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3431   }
3432 
3433   // } else if (cnt1 < 32) {
3434 
3435   bind(SHORT_UNROLLED_BEGIN);
3436   // int i = 1;
3437   movl(index, 1);
3438   cmpl(index, cnt1);
3439   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3440 
3441   // for (; i < cnt1 ; i += 2) {
3442   bind(SHORT_UNROLLED_LOOP_BEGIN);
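  // Two array elements per iteration:
  //   result = result*31*31 + a[i-1]*31 + a[i]
  // using 961 == 31*31 and (x << 5) - x == 31*x.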
3443   movl(tmp3, 961);
3444   imull(result, tmp3);
3445   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3446   movl(tmp3, tmp2);
3447   shll(tmp3, 5);
3448   subl(tmp3, tmp2);
3449   addl(result, tmp3);
3450   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3451   addl(result, tmp3);
3452   addl(index, 2);
3453   cmpl(index, cnt1);
3454   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3455 
3456   // }
3457   // if (i >= cnt1) {
3458   bind(SHORT_UNROLLED_LOOP_EXIT);
3459   jccb(Assembler::greater, END);
3460   movl(tmp2, result);
3461   shll(result, 5);
3462   subl(result, tmp2);
3463   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3464   addl(result, tmp3);
3465   // }
3466   bind(END);
3467 
3468   BLOCK_COMMENT("} // arrays_hashcode");
3469 
3470 } // arrays_hashcode
3471 
3472 // helper function for string_compare
3473 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3474                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3475                                            Address::ScaleFactor scale2, Register index, int ae) {
3476   if (ae == StrIntrinsicNode::LL) {
3477     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3478     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3479   } else if (ae == StrIntrinsicNode::UU) {
3480     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3481     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3482   } else {
3483     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3484     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3485   }
3486 }
3487 
3488 // Compare strings, used for char[] and byte[].
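//
// Java-level sketch of the result contract (illustrative only; for the UL
// encoding the emitted code negates the result at the very end, see DONE_LABEL):
//
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) {
//         return s1[i] - s2[i];
//       }
//     }
//     return len1 - len2;
//   }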
3489 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3490                                        Register cnt1, Register cnt2, Register result,
3491                                        XMMRegister vec1, int ae, KRegister mask) {
3492   ShortBranchVerifier sbv(this);
3493   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3494   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3495   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3496   int stride2x2 = 0x40;
3497   Address::ScaleFactor scale = Address::no_scale;
3498   Address::ScaleFactor scale1 = Address::no_scale;
3499   Address::ScaleFactor scale2 = Address::no_scale;
3500 
3501   if (ae != StrIntrinsicNode::LL) {
3502     stride2x2 = 0x20;
3503   }
3504 
3505   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3506     shrl(cnt2, 1);
3507   }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (pushed on the stack).
  // Use a conditional move to select the minimum.
3511   movl(result, cnt1);
3512   subl(cnt1, cnt2);
3513   push(cnt1);
3514   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3515 
3516   // Is the minimum length zero?
3517   testl(cnt2, cnt2);
3518   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3519   if (ae == StrIntrinsicNode::LL) {
3520     // Load first bytes
3521     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3522     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3523   } else if (ae == StrIntrinsicNode::UU) {
3524     // Load first characters
3525     load_unsigned_short(result, Address(str1, 0));
3526     load_unsigned_short(cnt1, Address(str2, 0));
3527   } else {
3528     load_unsigned_byte(result, Address(str1, 0));
3529     load_unsigned_short(cnt1, Address(str2, 0));
3530   }
3531   subl(result, cnt1);
3532   jcc(Assembler::notZero,  POP_LABEL);
3533 
3534   if (ae == StrIntrinsicNode::UU) {
3535     // Divide length by 2 to get number of chars
3536     shrl(cnt2, 1);
3537   }
3538   cmpl(cnt2, 1);
3539   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3540 
3541   // Check if the strings start at the same location and setup scale and stride
3542   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3543     cmpptr(str1, str2);
3544     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3545     if (ae == StrIntrinsicNode::LL) {
3546       scale = Address::times_1;
3547       stride = 16;
3548     } else {
3549       scale = Address::times_2;
3550       stride = 8;
3551     }
3552   } else {
3553     scale1 = Address::times_1;
3554     scale2 = Address::times_2;
3555     // scale not used
3556     stride = 8;
3557   }
3558 
3559   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3560     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3561     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3562     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3563     Label COMPARE_TAIL_LONG;
3564     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3565 
3566     int pcmpmask = 0x19;
3567     if (ae == StrIntrinsicNode::LL) {
3568       pcmpmask &= ~0x01;
3569     }
3570 
    // Set up to compare 16-char (32-byte) vectors;
    // start from the first character again because it has an aligned address.
3573     if (ae == StrIntrinsicNode::LL) {
3574       stride2 = 32;
3575     } else {
3576       stride2 = 16;
3577     }
3578     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3579       adr_stride = stride << scale;
3580     } else {
3581       adr_stride1 = 8;  //stride << scale1;
3582       adr_stride2 = 16; //stride << scale2;
3583     }
3584 
3585     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3586     // rax and rdx are used by pcmpestri as elements counters
3587     movl(result, cnt2);
3588     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3589     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3590 
3591     // fast path : compare first 2 8-char vectors.
3592     bind(COMPARE_16_CHARS);
3593     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3594       movdqu(vec1, Address(str1, 0));
3595     } else {
3596       pmovzxbw(vec1, Address(str1, 0));
3597     }
3598     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3599     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3600 
3601     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3602       movdqu(vec1, Address(str1, adr_stride));
3603       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3604     } else {
3605       pmovzxbw(vec1, Address(str1, adr_stride1));
3606       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3607     }
3608     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3609     addl(cnt1, stride);
3610 
3611     // Compare the characters at index in cnt1
3612     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3613     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3614     subl(result, cnt2);
3615     jmp(POP_LABEL);
3616 
3617     // Setup the registers to start vector comparison loop
3618     bind(COMPARE_WIDE_VECTORS);
3619     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3620       lea(str1, Address(str1, result, scale));
3621       lea(str2, Address(str2, result, scale));
3622     } else {
3623       lea(str1, Address(str1, result, scale1));
3624       lea(str2, Address(str2, result, scale2));
3625     }
3626     subl(result, stride2);
3627     subl(cnt2, stride2);
3628     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3629     negptr(result);
3630 
    //  In a loop, compare 16 chars (32 bytes) at once using vpxor+vptest
3632     bind(COMPARE_WIDE_VECTORS_LOOP);
3633 
3634 #ifdef _LP64
3635     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3636       cmpl(cnt2, stride2x2);
3637       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3638       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3639       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3640 
3641       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3642       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3643         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3644         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3645       } else {
3646         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3647         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3648       }
3649       kortestql(mask, mask);
3650       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3651       addptr(result, stride2x2);  // update since we already compared at this addr
3652       subl(cnt2, stride2x2);      // and sub the size too
3653       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3654 
3655       vpxor(vec1, vec1);
3656       jmpb(COMPARE_WIDE_TAIL);
3657     }//if (VM_Version::supports_avx512vlbw())
3658 #endif // _LP64
3659 
3660 
3661     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3662     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3663       vmovdqu(vec1, Address(str1, result, scale));
3664       vpxor(vec1, Address(str2, result, scale));
3665     } else {
3666       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3667       vpxor(vec1, Address(str2, result, scale2));
3668     }
3669     vptest(vec1, vec1);
3670     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3671     addptr(result, stride2);
3672     subl(cnt2, stride2);
3673     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3674     // clean upper bits of YMM registers
3675     vpxor(vec1, vec1);
3676 
3677     // compare wide vectors tail
3678     bind(COMPARE_WIDE_TAIL);
3679     testptr(result, result);
3680     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3681 
3682     movl(result, stride2);
3683     movl(cnt2, result);
3684     negptr(result);
3685     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3686 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3688     bind(VECTOR_NOT_EQUAL);
3689     // clean upper bits of YMM registers
3690     vpxor(vec1, vec1);
3691     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3692       lea(str1, Address(str1, result, scale));
3693       lea(str2, Address(str2, result, scale));
3694     } else {
3695       lea(str1, Address(str1, result, scale1));
3696       lea(str2, Address(str2, result, scale2));
3697     }
3698     jmp(COMPARE_16_CHARS);
3699 
    // Compare tail chars, length between 1 and 15 chars
3701     bind(COMPARE_TAIL_LONG);
3702     movl(cnt2, result);
3703     cmpl(cnt2, stride);
3704     jcc(Assembler::less, COMPARE_SMALL_STR);
3705 
3706     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3707       movdqu(vec1, Address(str1, 0));
3708     } else {
3709       pmovzxbw(vec1, Address(str1, 0));
3710     }
3711     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3712     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3713     subptr(cnt2, stride);
3714     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3715     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3716       lea(str1, Address(str1, result, scale));
3717       lea(str2, Address(str2, result, scale));
3718     } else {
3719       lea(str1, Address(str1, result, scale1));
3720       lea(str2, Address(str2, result, scale2));
3721     }
3722     negptr(cnt2);
3723     jmpb(WHILE_HEAD_LABEL);
3724 
3725     bind(COMPARE_SMALL_STR);
3726   } else if (UseSSE42Intrinsics) {
3727     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3728     int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors;
    // start from the first character again because it has an aligned address.
3731     movl(result, cnt2);
3732     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3733     if (ae == StrIntrinsicNode::LL) {
3734       pcmpmask &= ~0x01;
3735     }
3736     jcc(Assembler::zero, COMPARE_TAIL);
3737     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3738       lea(str1, Address(str1, result, scale));
3739       lea(str2, Address(str2, result, scale));
3740     } else {
3741       lea(str1, Address(str1, result, scale1));
3742       lea(str2, Address(str2, result, scale2));
3743     }
3744     negptr(result);
3745 
3746     // pcmpestri
3747     //   inputs:
3748     //     vec1- substring
3749     //     rax - negative string length (elements count)
3750     //     mem - scanned string
3751     //     rdx - string length (elements count)
3752     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3753     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3754     //   outputs:
3755     //     rcx - first mismatched element index
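    //   As used below: CF is set when a mismatch was found within the
    //   compared elements; rcx then holds the index of the first mismatch.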
3756     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3757 
3758     bind(COMPARE_WIDE_VECTORS);
3759     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3760       movdqu(vec1, Address(str1, result, scale));
3761       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3762     } else {
3763       pmovzxbw(vec1, Address(str1, result, scale1));
3764       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3765     }
3766     // After pcmpestri cnt1(rcx) contains mismatched element index
3767 
3768     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3769     addptr(result, stride);
3770     subptr(cnt2, stride);
3771     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3772 
3773     // compare wide vectors tail
3774     testptr(result, result);
3775     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3776 
3777     movl(cnt2, stride);
3778     movl(result, stride);
3779     negptr(result);
3780     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3781       movdqu(vec1, Address(str1, result, scale));
3782       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3783     } else {
3784       pmovzxbw(vec1, Address(str1, result, scale1));
3785       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3786     }
3787     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3788 
3789     // Mismatched characters in the vectors
3790     bind(VECTOR_NOT_EQUAL);
3791     addptr(cnt1, result);
3792     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3793     subl(result, cnt2);
3794     jmpb(POP_LABEL);
3795 
3796     bind(COMPARE_TAIL); // limit is zero
3797     movl(cnt2, result);
3798     // Fallthru to tail compare
3799   }
3800   // Shift str2 and str1 to the end of the arrays, negate min
3801   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3802     lea(str1, Address(str1, cnt2, scale));
3803     lea(str2, Address(str2, cnt2, scale));
3804   } else {
3805     lea(str1, Address(str1, cnt2, scale1));
3806     lea(str2, Address(str2, cnt2, scale2));
3807   }
3808   decrementl(cnt2);  // first character was compared already
3809   negptr(cnt2);
3810 
3811   // Compare the rest of the elements
3812   bind(WHILE_HEAD_LABEL);
3813   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3814   subl(result, cnt1);
3815   jccb(Assembler::notZero, POP_LABEL);
3816   increment(cnt2);
3817   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3818 
3819   // Strings are equal up to min length.  Return the length difference.
3820   bind(LENGTH_DIFF_LABEL);
3821   pop(result);
3822   if (ae == StrIntrinsicNode::UU) {
3823     // Divide diff by 2 to get number of chars
3824     sarl(result, 1);
3825   }
3826   jmpb(DONE_LABEL);
3827 
3828 #ifdef _LP64
3829   if (VM_Version::supports_avx512vlbw()) {
3830 
3831     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3832 
3833     kmovql(cnt1, mask);
3834     notq(cnt1);
3835     bsfq(cnt2, cnt1);
3836     if (ae != StrIntrinsicNode::LL) {
3837       // Divide diff by 2 to get number of chars
3838       sarl(cnt2, 1);
3839     }
3840     addq(result, cnt2);
3841     if (ae == StrIntrinsicNode::LL) {
3842       load_unsigned_byte(cnt1, Address(str2, result));
3843       load_unsigned_byte(result, Address(str1, result));
3844     } else if (ae == StrIntrinsicNode::UU) {
3845       load_unsigned_short(cnt1, Address(str2, result, scale));
3846       load_unsigned_short(result, Address(str1, result, scale));
3847     } else {
3848       load_unsigned_short(cnt1, Address(str2, result, scale2));
3849       load_unsigned_byte(result, Address(str1, result, scale1));
3850     }
3851     subl(result, cnt1);
3852     jmpb(POP_LABEL);
3853   }//if (VM_Version::supports_avx512vlbw())
3854 #endif // _LP64
3855 
3856   // Discard the stored length difference
3857   bind(POP_LABEL);
3858   pop(cnt1);
3859 
3860   // That's it
3861   bind(DONE_LABEL);
3862   if(ae == StrIntrinsicNode::UL) {
3863     negl(result);
3864   }
3865 
3866 }
3867 
// Search for a non-ASCII character (negative byte value) in a byte array and
// return the index of the first such character; otherwise return the length
// of the array segment searched.
3871 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3872 //   @IntrinsicCandidate
3873 //   public static int countPositives(byte[] ba, int off, int len) {
3874 //     for (int i = off; i < off + len; i++) {
3875 //       if (ba[i] < 0) {
3876 //         return i - off;
3877 //       }
3878 //     }
3879 //     return len;
3880 //   }
3881 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3882   Register result, Register tmp1,
3883   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3884   // rsi: byte array
3885   // rcx: len
3886   // rax: result
3887   ShortBranchVerifier sbv(this);
3888   assert_different_registers(ary1, len, result, tmp1);
3889   assert_different_registers(vec1, vec2);
3890   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3891 
3892   movl(result, len); // copy
3893   // len == 0
3894   testl(len, len);
3895   jcc(Assembler::zero, DONE);
3896 
3897   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3898     VM_Version::supports_avx512vlbw() &&
3899     VM_Version::supports_bmi2()) {
3900 
3901     Label test_64_loop, test_tail, BREAK_LOOP;
3902     movl(tmp1, len);
3903     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3904 
3905     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3906     andl(len,  0xffffffc0); // vector count (in chars)
3907     jccb(Assembler::zero, test_tail);
3908 
3909     lea(ary1, Address(ary1, len, Address::times_1));
3910     negptr(len);
3911 
3912     bind(test_64_loop);
    // Check whether our 64 byte-sized elements contain negatives
3914     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3915     kortestql(mask1, mask1);
3916     jcc(Assembler::notZero, BREAK_LOOP);
3917 
3918     addptr(len, 64);
3919     jccb(Assembler::notZero, test_64_loop);
3920 
3921     bind(test_tail);
3922     // bail out when there is nothing to be done
3923     testl(tmp1, -1);
3924     jcc(Assembler::zero, DONE);
3925 
3926 
    // Check the tail for absence of negatives:
    // ~(~0 << len), applied up to two times (for the 32-bit scenario)
3929 #ifdef _LP64
3930     {
3931       Register tmp3_aliased = len;
3932       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3933       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3934       notq(tmp3_aliased);
3935       kmovql(mask2, tmp3_aliased);
3936     }
3937 #else
3938     Label k_init;
3939     jmp(k_init);
3940 
    // We cannot read 64 bits from a general-purpose register, thus we move
    // the data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements from 0..63 which is later
    // used as a compare target together with the tail count held in tmp1.
    // The result is a k register holding tmp1 consecutive 1's, counting
    // from the least significant bit.
3947     address tmp = pc();
3948     emit_int64(0x0706050403020100);
3949     emit_int64(0x0F0E0D0C0B0A0908);
3950     emit_int64(0x1716151413121110);
3951     emit_int64(0x1F1E1D1C1B1A1918);
3952     emit_int64(0x2726252423222120);
3953     emit_int64(0x2F2E2D2C2B2A2928);
3954     emit_int64(0x3736353433323130);
3955     emit_int64(0x3F3E3D3C3B3A3938);
3956 
3957     bind(k_init);
3958     lea(len, InternalAddress(tmp));
3959     // create mask to test for negative byte inside a vector
3960     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3961     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3962 
3963 #endif
3964     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3965     ktestq(mask1, mask2);
3966     jcc(Assembler::zero, DONE);
3967 
    // do a full check for negative bytes in the tail
3969     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
3970                      // ary1 already pointing to the right place
3971     jmpb(TAIL_START);
3972 
3973     bind(BREAK_LOOP);
3974     // At least one byte in the last 64 byte block was negative.
3975     // Set up to look at the last 64 bytes as if they were a tail
3976     lea(ary1, Address(ary1, len, Address::times_1));
3977     addptr(result, len);
3978     // Ignore the very last byte: if all others are positive,
3979     // it must be negative, so we can skip right to the 2+1 byte
3980     // end comparison at this point
3981     orl(result, 63);
3982     movl(len, 63);
3983     // Fallthru to tail compare
3984   } else {
3985 
3986     if (UseAVX >= 2 && UseSSE >= 2) {
3987       // With AVX2, use 32-byte vector compare
3988       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3989 
3990       // Compare 32-byte vectors
3991       testl(len, 0xffffffe0);   // vector count (in bytes)
3992       jccb(Assembler::zero, TAIL_START);
3993 
3994       andl(len, 0xffffffe0);
3995       lea(ary1, Address(ary1, len, Address::times_1));
3996       negptr(len);
3997 
3998       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3999       movdl(vec2, tmp1);
4000       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4001 
4002       bind(COMPARE_WIDE_VECTORS);
4003       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4004       vptest(vec1, vec2);
4005       jccb(Assembler::notZero, BREAK_LOOP);
4006       addptr(len, 32);
4007       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4008 
4009       testl(result, 0x0000001f);   // any bytes remaining?
4010       jcc(Assembler::zero, DONE);
4011 
4012       // Quick test using the already prepared vector mask
4013       movl(len, result);
4014       andl(len, 0x0000001f);
4015       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4016       vptest(vec1, vec2);
4017       jcc(Assembler::zero, DONE);
4018       // There are zeros, jump to the tail to determine exactly where
4019       jmpb(TAIL_START);
4020 
4021       bind(BREAK_LOOP);
4022       // At least one byte in the last 32-byte vector is negative.
4023       // Set up to look at the last 32 bytes as if they were a tail
4024       lea(ary1, Address(ary1, len, Address::times_1));
4025       addptr(result, len);
4026       // Ignore the very last byte: if all others are positive,
4027       // it must be negative, so we can skip right to the 2+1 byte
4028       // end comparison at this point
4029       orl(result, 31);
4030       movl(len, 31);
4031       // Fallthru to tail compare
4032     } else if (UseSSE42Intrinsics) {
4033       // With SSE4.2, use double quad vector compare
4034       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4035 
4036       // Compare 16-byte vectors
4037       testl(len, 0xfffffff0);   // vector count (in bytes)
4038       jcc(Assembler::zero, TAIL_START);
4039 
4040       andl(len, 0xfffffff0);
4041       lea(ary1, Address(ary1, len, Address::times_1));
4042       negptr(len);
4043 
4044       movl(tmp1, 0x80808080);
4045       movdl(vec2, tmp1);
4046       pshufd(vec2, vec2, 0);
4047 
4048       bind(COMPARE_WIDE_VECTORS);
4049       movdqu(vec1, Address(ary1, len, Address::times_1));
4050       ptest(vec1, vec2);
4051       jccb(Assembler::notZero, BREAK_LOOP);
4052       addptr(len, 16);
4053       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4054 
4055       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4056       jcc(Assembler::zero, DONE);
4057 
4058       // Quick test using the already prepared vector mask
4059       movl(len, result);
4060       andl(len, 0x0000000f);   // tail count (in bytes)
4061       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4062       ptest(vec1, vec2);
4063       jcc(Assembler::zero, DONE);
4064       jmpb(TAIL_START);
4065 
4066       bind(BREAK_LOOP);
4067       // At least one byte in the last 16-byte vector is negative.
4068       // Set up and look at the last 16 bytes as if they were a tail
4069       lea(ary1, Address(ary1, len, Address::times_1));
4070       addptr(result, len);
4071       // Ignore the very last byte: if all others are positive,
4072       // it must be negative, so we can skip right to the 2+1 byte
4073       // end comparison at this point
4074       orl(result, 15);
4075       movl(len, 15);
4076       // Fallthru to tail compare
4077     }
4078   }
4079 
4080   bind(TAIL_START);
4081   // Compare 4-byte vectors
4082   andl(len, 0xfffffffc); // vector count (in bytes)
4083   jccb(Assembler::zero, COMPARE_CHAR);
4084 
4085   lea(ary1, Address(ary1, len, Address::times_1));
4086   negptr(len);
4087 
4088   bind(COMPARE_VECTORS);
4089   movl(tmp1, Address(ary1, len, Address::times_1));
4090   andl(tmp1, 0x80808080);
4091   jccb(Assembler::notZero, TAIL_ADJUST);
4092   addptr(len, 4);
4093   jccb(Assembler::notZero, COMPARE_VECTORS);
4094 
4095   // Compare trailing char (final 2-3 bytes), if any
4096   bind(COMPARE_CHAR);
4097 
4098   testl(result, 0x2);   // tail  char
4099   jccb(Assembler::zero, COMPARE_BYTE);
4100   load_unsigned_short(tmp1, Address(ary1, 0));
4101   andl(tmp1, 0x00008080);
4102   jccb(Assembler::notZero, CHAR_ADJUST);
4103   lea(ary1, Address(ary1, 2));
4104 
4105   bind(COMPARE_BYTE);
4106   testl(result, 0x1);   // tail  byte
4107   jccb(Assembler::zero, DONE);
4108   load_unsigned_byte(tmp1, Address(ary1, 0));
4109   testl(tmp1, 0x00000080);
4110   jccb(Assembler::zero, DONE);
4111   subptr(result, 1);
4112   jmpb(DONE);
4113 
4114   bind(TAIL_ADJUST);
4115   // There are negative bytes in the last 4-byte block.
4116   // Adjust result and check the next three bytes
4117   addptr(result, len);
4118   orl(result, 3);
4119   lea(ary1, Address(ary1, len, Address::times_1));
4120   jmpb(COMPARE_CHAR);
4121 
4122   bind(CHAR_ADJUST);
4123   // We are looking at a char + optional byte tail, and found that one
4124   // of the bytes in the char is negative. Adjust the result, check the
4125   // first byte and readjust if needed.
4126   andl(result, 0xfffffffc);
4127   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4128   jccb(Assembler::notZero, DONE);
4129   addptr(result, 1);
4130 
4131   // That's it
4132   bind(DONE);
4133   if (UseAVX >= 2 && UseSSE >= 2) {
4134     // clean upper bits of YMM registers
4135     vpxor(vec1, vec1);
4136     vpxor(vec2, vec2);
4137   }
4138 }
4139 
4140 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
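     // A minimal scalar sketch of the comparison being vectorized (illustrative only; it ignores
     // the null and length checks done for the array case):
     //   bool equals(const jbyte* a, const jbyte* b, int len) {
     //     for (int i = 0; i < len; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }
     // The code below performs the same element-wise comparison 64/32/16/4 bytes at a time.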
4141 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4142                                       Register limit, Register result, Register chr,
4143                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4144   ShortBranchVerifier sbv(this);
4145   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4146 
4147   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4148   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4149 
4150   if (is_array_equ) {
4151     // Check the input args
4152     cmpoop(ary1, ary2);
4153     jcc(Assembler::equal, TRUE_LABEL);
4154 
4155     // Need additional checks for arrays_equals.
4156     testptr(ary1, ary1);
4157     jcc(Assembler::zero, FALSE_LABEL);
4158     testptr(ary2, ary2);
4159     jcc(Assembler::zero, FALSE_LABEL);
4160 
4161     // Check the lengths
4162     movl(limit, Address(ary1, length_offset));
4163     cmpl(limit, Address(ary2, length_offset));
4164     jcc(Assembler::notEqual, FALSE_LABEL);
4165   }
4166 
4167   // count == 0
4168   testl(limit, limit);
4169   jcc(Assembler::zero, TRUE_LABEL);
4170 
4171   if (is_array_equ) {
4172     // Load array address
4173     lea(ary1, Address(ary1, base_offset));
4174     lea(ary2, Address(ary2, base_offset));
4175   }
4176 
4177   if (is_array_equ && is_char) {
4178     // arrays_equals when used for char[].
4179     shll(limit, 1);      // byte count != 0
4180   }
4181   movl(result, limit); // copy
4182 
4183   if (UseAVX >= 2) {
4184     // With AVX2, use 32-byte vector compare
4185     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4186 
4187     // Compare 32-byte vectors
4188     andl(result, 0x0000001f);  //   tail count (in bytes)
4189     andl(limit, 0xffffffe0);   // vector count (in bytes)
4190     jcc(Assembler::zero, COMPARE_TAIL);
4191 
4192     lea(ary1, Address(ary1, limit, Address::times_1));
4193     lea(ary2, Address(ary2, limit, Address::times_1));
4194     negptr(limit);
4195 
4196 #ifdef _LP64
4197     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4198       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4199 
4200       cmpl(limit, -64);
4201       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4202 
4203       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4204 
4205       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4206       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4207       kortestql(mask, mask);
4208       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4209       addptr(limit, 64);  // update since we already compared at this addr
4210       cmpl(limit, -64);
4211       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4212 
4213       // At this point we may still need to compare -limit+result bytes.
4214       // We could execute the next two instructions and just continue via the non-wide path:
4215       //  cmpl(limit, 0);
4216       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4217       // But since we stopped at the points ary{1,2}+limit which are
4218       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4219       // (|limit| <= 32 and result < 32),
4220       // we may just compare the last 64 bytes.
4221       //
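           // E.g. (illustrative) for 100-byte arrays: result = 4 and limit = -32 at this point,
           // after one 64-byte compare of bytes [0,64); result-64 = -60 addresses ary+96-60 = ary+36,
           // so the compare below covers bytes [36,100) and never reads past either array.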
4222       addptr(result, -64);   // it is safe, because we just came from this area
4223       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4224       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4225       kortestql(mask, mask);
4226       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4227 
4228       jmp(TRUE_LABEL);
4229 
4230       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4231 
4232     }//if (VM_Version::supports_avx512vlbw())
4233 #endif //_LP64
4234     bind(COMPARE_WIDE_VECTORS);
4235     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4236     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4237     vpxor(vec1, vec2);
4238 
4239     vptest(vec1, vec1);
4240     jcc(Assembler::notZero, FALSE_LABEL);
4241     addptr(limit, 32);
4242     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4243 
4244     testl(result, result);
4245     jcc(Assembler::zero, TRUE_LABEL);
4246 
4247     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4248     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4249     vpxor(vec1, vec2);
4250 
4251     vptest(vec1, vec1);
4252     jccb(Assembler::notZero, FALSE_LABEL);
4253     jmpb(TRUE_LABEL);
4254 
4255     bind(COMPARE_TAIL); // limit is zero
4256     movl(limit, result);
4257     // Fallthru to tail compare
4258   } else if (UseSSE42Intrinsics) {
4259     // With SSE4.2, use double quad vector compare
4260     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4261 
4262     // Compare 16-byte vectors
4263     andl(result, 0x0000000f);  //   tail count (in bytes)
4264     andl(limit, 0xfffffff0);   // vector count (in bytes)
4265     jcc(Assembler::zero, COMPARE_TAIL);
4266 
4267     lea(ary1, Address(ary1, limit, Address::times_1));
4268     lea(ary2, Address(ary2, limit, Address::times_1));
4269     negptr(limit);
4270 
4271     bind(COMPARE_WIDE_VECTORS);
4272     movdqu(vec1, Address(ary1, limit, Address::times_1));
4273     movdqu(vec2, Address(ary2, limit, Address::times_1));
4274     pxor(vec1, vec2);
4275 
4276     ptest(vec1, vec1);
4277     jcc(Assembler::notZero, FALSE_LABEL);
4278     addptr(limit, 16);
4279     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4280 
4281     testl(result, result);
4282     jcc(Assembler::zero, TRUE_LABEL);
4283 
4284     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4285     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4286     pxor(vec1, vec2);
4287 
4288     ptest(vec1, vec1);
4289     jccb(Assembler::notZero, FALSE_LABEL);
4290     jmpb(TRUE_LABEL);
4291 
4292     bind(COMPARE_TAIL); // limit is zero
4293     movl(limit, result);
4294     // Fallthru to tail compare
4295   }
4296 
4297   // Compare 4-byte vectors
4298   andl(limit, 0xfffffffc); // vector count (in bytes)
4299   jccb(Assembler::zero, COMPARE_CHAR);
4300 
4301   lea(ary1, Address(ary1, limit, Address::times_1));
4302   lea(ary2, Address(ary2, limit, Address::times_1));
4303   negptr(limit);
4304 
4305   bind(COMPARE_VECTORS);
4306   movl(chr, Address(ary1, limit, Address::times_1));
4307   cmpl(chr, Address(ary2, limit, Address::times_1));
4308   jccb(Assembler::notEqual, FALSE_LABEL);
4309   addptr(limit, 4);
4310   jcc(Assembler::notZero, COMPARE_VECTORS);
4311 
4312   // Compare trailing char (final 2 bytes), if any
4313   bind(COMPARE_CHAR);
4314   testl(result, 0x2);   // tail  char
4315   jccb(Assembler::zero, COMPARE_BYTE);
4316   load_unsigned_short(chr, Address(ary1, 0));
4317   load_unsigned_short(limit, Address(ary2, 0));
4318   cmpl(chr, limit);
4319   jccb(Assembler::notEqual, FALSE_LABEL);
4320 
4321   if (is_array_equ && is_char) {
4322     bind(COMPARE_BYTE);
4323   } else {
4324     lea(ary1, Address(ary1, 2));
4325     lea(ary2, Address(ary2, 2));
4326 
4327     bind(COMPARE_BYTE);
4328     testl(result, 0x1);   // tail  byte
4329     jccb(Assembler::zero, TRUE_LABEL);
4330     load_unsigned_byte(chr, Address(ary1, 0));
4331     load_unsigned_byte(limit, Address(ary2, 0));
4332     cmpl(chr, limit);
4333     jccb(Assembler::notEqual, FALSE_LABEL);
4334   }
4335   bind(TRUE_LABEL);
4336   movl(result, 1);   // return true
4337   jmpb(DONE);
4338 
4339   bind(FALSE_LABEL);
4340   xorl(result, result); // return false
4341 
4342   // That's it
4343   bind(DONE);
4344   if (UseAVX >= 2) {
4345     // clean upper bits of YMM registers
4346     vpxor(vec1, vec1);
4347     vpxor(vec2, vec2);
4348   }
4349 }
4350 
4351 #ifdef _LP64
4352 
4353 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4354 #define __ masm.
4355   Register dst = stub.data<0>();
4356   XMMRegister src = stub.data<1>();
4357   address target = stub.data<2>();
4358   __ bind(stub.entry());
4359   __ subptr(rsp, 8);
4360   __ movdbl(Address(rsp), src);
4361   __ call(RuntimeAddress(target));
4362   __ pop(dst);
4363   __ jmp(stub.continuation());
4364 #undef __
4365 }
4366 
4367 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4368   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4369   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4370 
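       // cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000 for int,
       // 0x8000000000000000 for long) when the source is NaN or out of range; comparing the
       // result against that sentinel decides whether the slow-path fixup stub must run.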
4371   address slowpath_target;
4372   if (dst_bt == T_INT) {
4373     if (src_bt == T_FLOAT) {
4374       cvttss2sil(dst, src);
4375       cmpl(dst, 0x80000000);
4376       slowpath_target = StubRoutines::x86::f2i_fixup();
4377     } else {
4378       cvttsd2sil(dst, src);
4379       cmpl(dst, 0x80000000);
4380       slowpath_target = StubRoutines::x86::d2i_fixup();
4381     }
4382   } else {
4383     if (src_bt == T_FLOAT) {
4384       cvttss2siq(dst, src);
4385       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4386       slowpath_target = StubRoutines::x86::f2l_fixup();
4387     } else {
4388       cvttsd2siq(dst, src);
4389       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4390       slowpath_target = StubRoutines::x86::d2l_fixup();
4391     }
4392   }
4393 
4394   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4395   jcc(Assembler::equal, stub->entry());
4396   bind(stub->continuation());
4397 }
4398 
4399 #endif // _LP64
4400 
4401 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4402                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4403   switch(ideal_opc) {
4404     case Op_LShiftVS:
4405       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4406     case Op_LShiftVI:
4407       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4408     case Op_LShiftVL:
4409       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4410     case Op_RShiftVS:
4411       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4412     case Op_RShiftVI:
4413       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4414     case Op_RShiftVL:
4415       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4416     case Op_URShiftVS:
4417       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4418     case Op_URShiftVI:
4419       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4420     case Op_URShiftVL:
4421       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4422     case Op_RotateRightV:
4423       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4424     case Op_RotateLeftV:
4425       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4426     default:
4427       fatal("Unsupported masked operation"); break;
4428   }
4429 }
4430 
4431 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4432                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4433                                     bool is_varshift) {
4434   switch (ideal_opc) {
4435     case Op_AddVB:
4436       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4437     case Op_AddVS:
4438       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4439     case Op_AddVI:
4440       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4441     case Op_AddVL:
4442       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4443     case Op_AddVF:
4444       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4445     case Op_AddVD:
4446       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4447     case Op_SubVB:
4448       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4449     case Op_SubVS:
4450       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4451     case Op_SubVI:
4452       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4453     case Op_SubVL:
4454       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4455     case Op_SubVF:
4456       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4457     case Op_SubVD:
4458       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4459     case Op_MulVS:
4460       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4461     case Op_MulVI:
4462       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4463     case Op_MulVL:
4464       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4465     case Op_MulVF:
4466       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4467     case Op_MulVD:
4468       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4469     case Op_DivVF:
4470       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4471     case Op_DivVD:
4472       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4473     case Op_SqrtVF:
4474       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4475     case Op_SqrtVD:
4476       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4477     case Op_AbsVB:
4478       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4479     case Op_AbsVS:
4480       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4481     case Op_AbsVI:
4482       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4483     case Op_AbsVL:
4484       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4485     case Op_FmaVF:
4486       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4487     case Op_FmaVD:
4488       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4489     case Op_VectorRearrange:
4490       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4491     case Op_LShiftVS:
4492       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4493     case Op_LShiftVI:
4494       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4495     case Op_LShiftVL:
4496       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4497     case Op_RShiftVS:
4498       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4499     case Op_RShiftVI:
4500       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4501     case Op_RShiftVL:
4502       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4503     case Op_URShiftVS:
4504       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4505     case Op_URShiftVI:
4506       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4507     case Op_URShiftVL:
4508       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4509     case Op_RotateLeftV:
4510       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4511     case Op_RotateRightV:
4512       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4513     case Op_MaxV:
4514       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4515     case Op_MinV:
4516       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4517     case Op_XorV:
4518       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4519     case Op_OrV:
4520       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4521     case Op_AndV:
4522       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4523     default:
4524       fatal("Unsupported masked operation"); break;
4525   }
4526 }
4527 
4528 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4529                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4530   switch (ideal_opc) {
4531     case Op_AddVB:
4532       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4533     case Op_AddVS:
4534       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4535     case Op_AddVI:
4536       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4537     case Op_AddVL:
4538       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4539     case Op_AddVF:
4540       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4541     case Op_AddVD:
4542       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4543     case Op_SubVB:
4544       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4545     case Op_SubVS:
4546       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4547     case Op_SubVI:
4548       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4549     case Op_SubVL:
4550       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4551     case Op_SubVF:
4552       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4553     case Op_SubVD:
4554       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4555     case Op_MulVS:
4556       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4557     case Op_MulVI:
4558       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4559     case Op_MulVL:
4560       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4561     case Op_MulVF:
4562       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4563     case Op_MulVD:
4564       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4565     case Op_DivVF:
4566       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4567     case Op_DivVD:
4568       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4569     case Op_FmaVF:
4570       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4571     case Op_FmaVD:
4572       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4573     case Op_MaxV:
4574       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4575     case Op_MinV:
4576       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4577     case Op_XorV:
4578       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4579     case Op_OrV:
4580       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4581     case Op_AndV:
4582       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4583     default:
4584       fatal("Unsupported masked operation"); break;
4585   }
4586 }
4587 
4588 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4589                                   KRegister src1, KRegister src2) {
4590   BasicType etype = T_ILLEGAL;
4591   switch(mask_len) {
4592     case 2:
4593     case 4:
4594     case 8:  etype = T_BYTE; break;
4595     case 16: etype = T_SHORT; break;
4596     case 32: etype = T_INT; break;
4597     case 64: etype = T_LONG; break;
4598     default: fatal("Unsupported type"); break;
4599   }
4600   assert(etype != T_ILLEGAL, "");
4601   switch(ideal_opc) {
4602     case Op_AndVMask:
4603       kand(etype, dst, src1, src2); break;
4604     case Op_OrVMask:
4605       kor(etype, dst, src1, src2); break;
4606     case Op_XorVMask:
4607       kxor(etype, dst, src1, src2); break;
4608     default:
4609       fatal("Unsupported masked operation"); break;
4610   }
4611 }
4612 
4613 /*
4614  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4615  * If src is NaN, the result is 0.
4616  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4617  * the result is equal to the value of Integer.MIN_VALUE.
4618  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4619  * the result is equal to the value of Integer.MAX_VALUE.
4620  */
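     //
     // A minimal scalar sketch of those semantics (illustrative only; assumes the usual
     // min_jint/max_jint constants):
     //   jint f2i(jfloat f) {
     //     if (f != f)        return 0;          // NaN
     //     if (f <= min_jint) return min_jint;   // includes -Inf
     //     if (f >= max_jint) return max_jint;   // includes +Inf
     //     return (jint) f;
     //   }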
4621 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4622                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4623                                                                    Register rscratch, AddressLiteral float_sign_flip,
4624                                                                    int vec_enc) {
4625   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4626   Label done;
4627   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4628   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4629   vptest(xtmp2, xtmp2, vec_enc);
4630   jccb(Assembler::equal, done);
4631 
4632   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4633   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4634 
4635   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4636   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4637   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4638 
4639   // Recompute the mask for the remaining special values.
4640   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4641   // Extract SRC values corresponding to TRUE mask lanes.
4642   vpand(xtmp4, xtmp2, src, vec_enc);
4643   // Flip the mask bits so that the MSB of the mask lanes corresponding to +ve special
4644   // values is set.
4645   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4646 
4647   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4648   bind(done);
4649 }
4650 
4651 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4652                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4653                                                                     Register rscratch, AddressLiteral float_sign_flip,
4654                                                                     int vec_enc) {
4655   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4656   Label done;
4657   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4658   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4659   kortestwl(ktmp1, ktmp1);
4660   jccb(Assembler::equal, done);
4661 
4662   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4663   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4664   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4665 
4666   kxorwl(ktmp1, ktmp1, ktmp2);
4667   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4668   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4669   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4670   bind(done);
4671 }
4672 
4673 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4674                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4675                                                                      Register rscratch, AddressLiteral double_sign_flip,
4676                                                                      int vec_enc) {
4677   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4678 
4679   Label done;
4680   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4681   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4682   kortestwl(ktmp1, ktmp1);
4683   jccb(Assembler::equal, done);
4684 
4685   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4686   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4687   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4688 
4689   kxorwl(ktmp1, ktmp1, ktmp2);
4690   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4691   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4692   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4693   bind(done);
4694 }
4695 
4696 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4697                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4698                                                                      Register rscratch, AddressLiteral float_sign_flip,
4699                                                                      int vec_enc) {
4700   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4701   Label done;
4702   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4703   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4704   kortestwl(ktmp1, ktmp1);
4705   jccb(Assembler::equal, done);
4706 
4707   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4708   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4709   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4710 
4711   kxorwl(ktmp1, ktmp1, ktmp2);
4712   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4713   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4714   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4715   bind(done);
4716 }
4717 
4718 /*
4719  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4720  * If src is NaN, the result is 0.
4721  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4722  * the result is equal to the value of Long.MIN_VALUE.
4723  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4724  * the result is equal to the value of Long.MAX_VALUE.
4725  */
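     // (A scalar sketch mirrors the f2i example given earlier, with jlong, min_jlong and max_jlong.)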
4726 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4727                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4728                                                                       Register rscratch, AddressLiteral double_sign_flip,
4729                                                                       int vec_enc) {
4730   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4731 
4732   Label done;
4733   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4734   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4735   kortestwl(ktmp1, ktmp1);
4736   jccb(Assembler::equal, done);
4737 
4738   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4739   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4740   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4741 
4742   kxorwl(ktmp1, ktmp1, ktmp2);
4743   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4744   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4745   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4746   bind(done);
4747 }
4748 
4749 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4750                                                              XMMRegister xtmp, int index, int vec_enc) {
4751    assert(vec_enc < Assembler::AVX_512bit, "");
4752    if (vec_enc == Assembler::AVX_256bit) {
4753      vextractf128_high(xtmp, src);
4754      vshufps(dst, src, xtmp, index, vec_enc);
4755    } else {
4756      vshufps(dst, src, zero, index, vec_enc);
4757    }
4758 }
4759 
4760 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4761                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4762                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4763   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4764 
4765   Label done;
4766   // Compare the destination lanes with float_sign_flip
4767   // value to get mask for all special values.
4768   movdqu(xtmp1, float_sign_flip, rscratch);
4769   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4770   ptest(xtmp2, xtmp2);
4771   jccb(Assembler::equal, done);
4772 
4773   // Flip float_sign_flip to get max integer value.
4774   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4775   pxor(xtmp1, xtmp4);
4776 
4777   // Set destination lanes corresponding to unordered source lanes to zero.
4778   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4779   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4780 
4781   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4782   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4783   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4784 
4785   // Recompute the mask for the remaining special values.
4786   pxor(xtmp2, xtmp3);
4787   // Extract mask corresponding to non-negative source lanes.
4788   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4789 
4790   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4791   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4792   pand(xtmp3, xtmp2);
4793 
4794   // Replace destination lanes holding special value(0x80000000) with max int
4795   // if corresponding source lane holds a +ve value.
4796   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4797   bind(done);
4798 }
4799 
4800 
4801 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4802                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4803   switch(to_elem_bt) {
4804     case T_SHORT:
4805       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4806       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4807       vpackusdw(dst, dst, zero, vec_enc);
4808       if (vec_enc == Assembler::AVX_256bit) {
4809         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4810       }
4811       break;
4812     case  T_BYTE:
4813       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4814       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4815       vpackusdw(dst, dst, zero, vec_enc);
4816       if (vec_enc == Assembler::AVX_256bit) {
4817         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4818       }
4819       vpackuswb(dst, dst, zero, vec_enc);
4820       break;
4821     default: assert(false, "%s", type2name(to_elem_bt));
4822   }
4823 }
4824 
4825 /*
4826  * Algorithm for vector D2L and F2I conversions:-
4827  * a) Perform vector D2L/F2I cast.
4828  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
4829  *    That value signifies that the source value could be any of the special floating point
4830  *    values (NaN, -Inf, Inf, Max, -Min).
4831  * c) Set destination to zero if source is NaN value.
4832  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
4833  */
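     //
     // E.g. (illustrative) a lane holding NaN converts to 0x80000000 in step a) and is zeroed by
     // step c); a lane holding 3.0e9f also converts to 0x80000000, and step d) rewrites it to
     // MaxInt because the source lane is positive.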
4834 
4835 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4836                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4837                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4838   int to_elem_sz = type2aelembytes(to_elem_bt);
4839   assert(to_elem_sz <= 4, "");
4840   vcvttps2dq(dst, src, vec_enc);
4841   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4842   if (to_elem_sz < 4) {
4843     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4844     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4845   }
4846 }
4847 
4848 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4849                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4850                                             Register rscratch, int vec_enc) {
4851   int to_elem_sz = type2aelembytes(to_elem_bt);
4852   assert(to_elem_sz <= 4, "");
4853   vcvttps2dq(dst, src, vec_enc);
4854   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4855   switch(to_elem_bt) {
4856     case T_INT:
4857       break;
4858     case T_SHORT:
4859       evpmovdw(dst, dst, vec_enc);
4860       break;
4861     case T_BYTE:
4862       evpmovdb(dst, dst, vec_enc);
4863       break;
4864     default: assert(false, "%s", type2name(to_elem_bt));
4865   }
4866 }
4867 
4868 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4869                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4870                                             Register rscratch, int vec_enc) {
4871   evcvttps2qq(dst, src, vec_enc);
4872   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4873 }
4874 
4875 // Handling for downcasting from double to integer or sub-word types on AVX2.
4876 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4877                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4878                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4879   int to_elem_sz = type2aelembytes(to_elem_bt);
4880   assert(to_elem_sz < 8, "");
4881   vcvttpd2dq(dst, src, vec_enc);
4882   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4883                                               float_sign_flip, vec_enc);
4884   if (to_elem_sz < 4) {
4885     // xtmp4 holds all zero lanes.
4886     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4887   }
4888 }
4889 
4890 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4891                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4892                                             KRegister ktmp2, AddressLiteral sign_flip,
4893                                             Register rscratch, int vec_enc) {
4894   if (VM_Version::supports_avx512dq()) {
4895     evcvttpd2qq(dst, src, vec_enc);
4896     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4897     switch(to_elem_bt) {
4898       case T_LONG:
4899         break;
4900       case T_INT:
4901         evpmovsqd(dst, dst, vec_enc);
4902         break;
4903       case T_SHORT:
4904         evpmovsqd(dst, dst, vec_enc);
4905         evpmovdw(dst, dst, vec_enc);
4906         break;
4907       case T_BYTE:
4908         evpmovsqd(dst, dst, vec_enc);
4909         evpmovdb(dst, dst, vec_enc);
4910         break;
4911       default: assert(false, "%s", type2name(to_elem_bt));
4912     }
4913   } else {
4914     assert(type2aelembytes(to_elem_bt) <= 4, "");
4915     vcvttpd2dq(dst, src, vec_enc);
4916     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4917     switch(to_elem_bt) {
4918       case T_INT:
4919         break;
4920       case T_SHORT:
4921         evpmovdw(dst, dst, vec_enc);
4922         break;
4923       case T_BYTE:
4924         evpmovdb(dst, dst, vec_enc);
4925         break;
4926       default: assert(false, "%s", type2name(to_elem_bt));
4927     }
4928   }
4929 }
4930 
4931 #ifdef _LP64
4932 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4933                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4934                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4935   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
4936   // and restore the original MXCSR.RC mode after that.
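       // E.g. val = 2.5: 2.5 + 0.5 = 3.0 converts to 3, while val = -2.5: -2.5 + 0.5 = -2.0
       // converts to -2, i.e. ties are rounded towards positive infinity (Math.round behaviour).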
4937   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4938 
4939   mov64(tmp, julong_cast(0.5L));
4940   evpbroadcastq(xtmp1, tmp, vec_enc);
4941   vaddpd(xtmp1, src , xtmp1, vec_enc);
4942   evcvtpd2qq(dst, xtmp1, vec_enc);
4943   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4944                                                 double_sign_flip, vec_enc);
4945 
4946   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4947 }
4948 
4949 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4950                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4951                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4952   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
4953   // and restore the original MXCSR.RC mode after that.
4954   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4955 
4956   movl(tmp, jint_cast(0.5));
4957   movq(xtmp1, tmp);
4958   vbroadcastss(xtmp1, xtmp1, vec_enc);
4959   vaddps(xtmp1, src , xtmp1, vec_enc);
4960   vcvtps2dq(dst, xtmp1, vec_enc);
4961   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4962                                               float_sign_flip, vec_enc);
4963 
4964   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4965 }
4966 
4967 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4968                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4969                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4970   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
4971   // and restore the original MXCSR.RC mode after that.
4972   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4973 
4974   movl(tmp, jint_cast(0.5));
4975   movq(xtmp1, tmp);
4976   vbroadcastss(xtmp1, xtmp1, vec_enc);
4977   vaddps(xtmp1, src , xtmp1, vec_enc);
4978   vcvtps2dq(dst, xtmp1, vec_enc);
4979   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4980 
4981   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4982 }
4983 #endif // _LP64
4984 
4985 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4986                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4987   switch (from_elem_bt) {
4988     case T_BYTE:
4989       switch (to_elem_bt) {
4990         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4991         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4992         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4993         default: ShouldNotReachHere();
4994       }
4995       break;
4996     case T_SHORT:
4997       switch (to_elem_bt) {
4998         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4999         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5000         default: ShouldNotReachHere();
5001       }
5002       break;
5003     case T_INT:
5004       assert(to_elem_bt == T_LONG, "");
5005       vpmovzxdq(dst, src, vlen_enc);
5006       break;
5007     default:
5008       ShouldNotReachHere();
5009   }
5010 }
5011 
5012 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5013                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5014   switch (from_elem_bt) {
5015     case T_BYTE:
5016       switch (to_elem_bt) {
5017         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5018         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5019         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5020         default: ShouldNotReachHere();
5021       }
5022       break;
5023     case T_SHORT:
5024       switch (to_elem_bt) {
5025         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5026         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5027         default: ShouldNotReachHere();
5028       }
5029       break;
5030     case T_INT:
5031       assert(to_elem_bt == T_LONG, "");
5032       vpmovsxdq(dst, src, vlen_enc);
5033       break;
5034     default:
5035       ShouldNotReachHere();
5036   }
5037 }
5038 
5039 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5040                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5041   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5042   assert(vlen_enc != AVX_512bit, "");
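       // Widening below uses sign extension; narrowing relies on signed saturating packs, which
       // map the 0 / -1 lane values of a vector mask to 0 / -1 in the narrower element size.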
5043 
5044   int dst_bt_size = type2aelembytes(dst_bt);
5045   int src_bt_size = type2aelembytes(src_bt);
5046   if (dst_bt_size > src_bt_size) {
5047     switch (dst_bt_size / src_bt_size) {
5048       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5049       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5050       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5051       default: ShouldNotReachHere();
5052     }
5053   } else {
5054     assert(dst_bt_size < src_bt_size, "");
5055     switch (src_bt_size / dst_bt_size) {
5056       case 2: {
5057         if (vlen_enc == AVX_128bit) {
5058           vpacksswb(dst, src, src, vlen_enc);
5059         } else {
5060           vpacksswb(dst, src, src, vlen_enc);
5061           vpermq(dst, dst, 0x08, vlen_enc);
5062         }
5063         break;
5064       }
5065       case 4: {
5066         if (vlen_enc == AVX_128bit) {
5067           vpackssdw(dst, src, src, vlen_enc);
5068           vpacksswb(dst, dst, dst, vlen_enc);
5069         } else {
5070           vpackssdw(dst, src, src, vlen_enc);
5071           vpermq(dst, dst, 0x08, vlen_enc);
5072           vpacksswb(dst, dst, dst, AVX_128bit);
5073         }
5074         break;
5075       }
5076       case 8: {
5077         if (vlen_enc == AVX_128bit) {
5078           vpshufd(dst, src, 0x08, vlen_enc);
5079           vpackssdw(dst, dst, dst, vlen_enc);
5080           vpacksswb(dst, dst, dst, vlen_enc);
5081         } else {
5082           vpshufd(dst, src, 0x08, vlen_enc);
5083           vpermq(dst, dst, 0x08, vlen_enc);
5084           vpackssdw(dst, dst, dst, AVX_128bit);
5085           vpacksswb(dst, dst, dst, AVX_128bit);
5086         }
5087         break;
5088       }
5089       default: ShouldNotReachHere();
5090     }
5091   }
5092 }
5093 
5094 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5095                                    bool merge, BasicType bt, int vlen_enc) {
5096   if (bt == T_INT) {
5097     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5098   } else {
5099     assert(bt == T_LONG, "");
5100     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5101   }
5102 }
5103 
5104 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5105                                    bool merge, BasicType bt, int vlen_enc) {
5106   if (bt == T_INT) {
5107     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5108   } else {
5109     assert(bt == T_LONG, "");
5110     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5111   }
5112 }
5113 
5114 #ifdef _LP64
5115 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5116                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5117                                                int vec_enc) {
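       // BMI2 PDEP with the mask 0x0101010101010101 scatters the low 8 bits of src into the
       // least significant bit of eight consecutive bytes, yielding a byte-per-lane vector mask
       // of 0/1 values, eight lanes per iteration.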
5118   int index = 0;
5119   int vindex = 0;
5120   mov64(rtmp1, 0x0101010101010101L);
5121   pdepq(rtmp1, src, rtmp1);
5122   if (mask_len > 8) {
5123     movq(rtmp2, src);
5124     vpxor(xtmp, xtmp, xtmp, vec_enc);
5125     movq(xtmp, rtmp1);
5126   }
5127   movq(dst, rtmp1);
5128 
5129   mask_len -= 8;
5130   while (mask_len > 0) {
5131     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5132     index++;
5133     if ((index % 2) == 0) {
5134       pxor(xtmp, xtmp);
5135     }
5136     mov64(rtmp1, 0x0101010101010101L);
5137     shrq(rtmp2, 8);
5138     pdepq(rtmp1, rtmp2, rtmp1);
5139     pinsrq(xtmp, rtmp1, index % 2);
5140     vindex = index / 2;
5141     if (vindex) {
5142       // Write the entire 16-byte vector once both 64-bit
5143       // lanes are updated, to save redundant instructions.
5144       if (index % 2) {
5145         vinsertf128(dst, dst, xtmp, vindex);
5146       }
5147     } else {
5148       vmovdqu(dst, xtmp);
5149     }
5150     mask_len -= 8;
5151   }
5152 }
5153 
5154 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5155   switch(opc) {
5156     case Op_VectorMaskTrueCount:
5157       popcntq(dst, tmp);
5158       break;
5159     case Op_VectorMaskLastTrue:
5160       if (VM_Version::supports_lzcnt()) {
5161         lzcntq(tmp, tmp);
5162         movl(dst, 63);
5163         subl(dst, tmp);
5164       } else {
5165         movl(dst, -1);
5166         bsrq(tmp, tmp);
5167         cmov32(Assembler::notZero, dst, tmp);
5168       }
5169       break;
5170     case Op_VectorMaskFirstTrue:
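           // Setting a sentinel bit at position 'masklen' makes tzcnt/bsf return masklen when no
           // lane is true, which is the expected "no first true" result.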
5171       if (VM_Version::supports_bmi1()) {
5172         if (masklen < 32) {
5173           orl(tmp, 1 << masklen);
5174           tzcntl(dst, tmp);
5175         } else if (masklen == 32) {
5176           tzcntl(dst, tmp);
5177         } else {
5178           assert(masklen == 64, "");
5179           tzcntq(dst, tmp);
5180         }
5181       } else {
5182         if (masklen < 32) {
5183           orl(tmp, 1 << masklen);
5184           bsfl(dst, tmp);
5185         } else {
5186           assert(masklen == 32 || masklen == 64, "");
5187           movl(dst, masklen);
5188           if (masklen == 32)  {
5189             bsfl(tmp, tmp);
5190           } else {
5191             bsfq(tmp, tmp);
5192           }
5193           cmov32(Assembler::notZero, dst, tmp);
5194         }
5195       }
5196       break;
5197     case Op_VectorMaskToLong:
5198       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5199       break;
5200     default: assert(false, "Unhandled mask operation");
5201   }
5202 }
5203 
5204 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5205                                               int masklen, int masksize, int vec_enc) {
5206   assert(VM_Version::supports_popcnt(), "");
5207 
5208   if(VM_Version::supports_avx512bw()) {
5209     kmovql(tmp, mask);
5210   } else {
5211     assert(masklen <= 16, "");
5212     kmovwl(tmp, mask);
5213   }
5214 
5215   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5216   // operations needs to be clipped.
5217   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5218     andq(tmp, (1 << masklen) - 1);
5219   }
5220 
5221   vector_mask_operation_helper(opc, dst, tmp, masklen);
5222 }
5223 
5224 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5225                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5226   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5227          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5228   assert(VM_Version::supports_popcnt(), "");
5229 
5230   bool need_clip = false;
5231   switch(bt) {
5232     case T_BOOLEAN:
5233       // While masks of other types contain 0 / -1 lane values, boolean masks contain 0 / 1 lane values
5234       vpxor(xtmp, xtmp, xtmp, vec_enc);
5235       vpsubb(xtmp, xtmp, mask, vec_enc);
5236       vpmovmskb(tmp, xtmp, vec_enc);
5237       need_clip = masklen < 16;
5238       break;
5239     case T_BYTE:
5240       vpmovmskb(tmp, mask, vec_enc);
5241       need_clip = masklen < 16;
5242       break;
5243     case T_SHORT:
5244       vpacksswb(xtmp, mask, mask, vec_enc);
5245       if (masklen >= 16) {
5246         vpermpd(xtmp, xtmp, 8, vec_enc);
5247       }
5248       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5249       need_clip = masklen < 16;
5250       break;
5251     case T_INT:
5252     case T_FLOAT:
5253       vmovmskps(tmp, mask, vec_enc);
5254       need_clip = masklen < 4;
5255       break;
5256     case T_LONG:
5257     case T_DOUBLE:
5258       vmovmskpd(tmp, mask, vec_enc);
5259       need_clip = masklen < 2;
5260       break;
5261     default: assert(false, "Unhandled type, %s", type2name(bt));
5262   }
5263 
5264   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5265   // operations needs to be clipped.
5266   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5267     // need_clip implies masklen < 32
5268     andq(tmp, (1 << masklen) - 1);
5269   }
5270 
5271   vector_mask_operation_helper(opc, dst, tmp, masklen);
5272 }
5273 
5274 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5275                                              Register rtmp2, int mask_len) {
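       // PEXT of an all-ones word by the (clipped) source mask packs one set bit per active lane
       // into the low bits, so the destination mask gets exactly popcount(src) low bits set.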
5276   kmov(rtmp1, src);
5277   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5278   mov64(rtmp2, -1L);
5279   pextq(rtmp2, rtmp2, rtmp1);
5280   kmov(dst, rtmp2);
5281 }
5282 
5283 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5284                                                bool merge, BasicType bt, int vec_enc) {
5285   if (opcode == Op_CompressV) {
5286     switch(bt) {
5287     case T_BYTE:
5288       evpcompressb(dst, mask, src, merge, vec_enc);
5289       break;
5290     case T_CHAR:
5291     case T_SHORT:
5292       evpcompressw(dst, mask, src, merge, vec_enc);
5293       break;
5294     case T_INT:
5295       evpcompressd(dst, mask, src, merge, vec_enc);
5296       break;
5297     case T_FLOAT:
5298       evcompressps(dst, mask, src, merge, vec_enc);
5299       break;
5300     case T_LONG:
5301       evpcompressq(dst, mask, src, merge, vec_enc);
5302       break;
5303     case T_DOUBLE:
5304       evcompresspd(dst, mask, src, merge, vec_enc);
5305       break;
5306     default:
5307       fatal("Unsupported type %s", type2name(bt));
5308       break;
5309     }
5310   } else {
5311     assert(opcode == Op_ExpandV, "");
5312     switch(bt) {
5313     case T_BYTE:
5314       evpexpandb(dst, mask, src, merge, vec_enc);
5315       break;
5316     case T_CHAR:
5317     case T_SHORT:
5318       evpexpandw(dst, mask, src, merge, vec_enc);
5319       break;
5320     case T_INT:
5321       evpexpandd(dst, mask, src, merge, vec_enc);
5322       break;
5323     case T_FLOAT:
5324       evexpandps(dst, mask, src, merge, vec_enc);
5325       break;
5326     case T_LONG:
5327       evpexpandq(dst, mask, src, merge, vec_enc);
5328       break;
5329     case T_DOUBLE:
5330       evexpandpd(dst, mask, src, merge, vec_enc);
5331       break;
5332     default:
5333       fatal("Unsupported type %s", type2name(bt));
5334       break;
5335     }
5336   }
5337 }
5338 #endif
5339 
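     // A minimal scalar sketch of the per-lane signum semantics implemented below (illustrative,
     // mirroring Math.signum):
     //   double signum(double x) {
     //     if (x != x || x == 0.0) return x;   // NaN, -0.0 and 0.0 pass through
     //     return x < 0.0 ? -1.0 : 1.0;
     //   }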
5340 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5341                                            KRegister ktmp1, int vec_enc) {
5342   if (opcode == Op_SignumVD) {
5343     vsubpd(dst, zero, one, vec_enc);
5344     // if src < 0 ? -1 : 1
5345     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5346     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5347     // if src is NaN, -0.0 or 0.0, return src.
5348     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5349     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5350   } else {
5351     assert(opcode == Op_SignumVF, "");
5352     vsubps(dst, zero, one, vec_enc);
5353     // if src < 0 ? -1 : 1
5354     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5355     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5356     // if src is NaN, -0.0 or 0.0, return src.
5357     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5358     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5359   }
5360 }
5361 
5362 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5363                                           XMMRegister xtmp1, int vec_enc) {
5364   if (opcode == Op_SignumVD) {
5365     vsubpd(dst, zero, one, vec_enc);
5366     // if src < 0 ? -1 : 1
5367     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5368     // if src is NaN, -0.0 or 0.0, return src.
5369     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5370     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5371   } else {
5372     assert(opcode == Op_SignumVF, "");
5373     vsubps(dst, zero, one, vec_enc);
5374     // if src < 0 ? -1 : 1
5375     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5376     // if src is NaN, -0.0 or 0.0, return src.
5377     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5378     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5379   }
5380 }
5381 
5382 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5383   if (VM_Version::supports_avx512bw()) {
5384     if (mask_len > 32) {
5385       kmovql(dst, src);
5386     } else {
5387       kmovdl(dst, src);
5388       if (mask_len != 32) {
5389         kshiftrdl(dst, dst, 32 - mask_len);
5390       }
5391     }
5392   } else {
5393     assert(mask_len <= 16, "");
5394     kmovwl(dst, src);
5395     if (mask_len != 16) {
5396       kshiftrwl(dst, dst, 16 - mask_len);
5397     }
5398   }
5399 }
5400 
5401 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5402   int lane_size = type2aelembytes(bt);
5403   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5404   if ((is_LP64 || lane_size < 8) &&
5405       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5406        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5407     movptr(rtmp, imm32);
5408     switch(lane_size) {
5409       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5410       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5411       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5412       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5413       default : fatal("Unsupported lane size %d", lane_size); break;
5415     }
5416   } else {
5417     movptr(rtmp, imm32);
5418     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5419     switch(lane_size) {
5420       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5421       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5422       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5423       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5424       default : fatal("Unsupported lane size %d", lane_size);
5425       break;
5426     }
5427   }
5428 }
5429 
5430 //
5431 // The following is a lookup table based popcount computation algorithm:
5432 //       Index   Bit set count
5433 //     [ 0000 ->   0,
5434 //       0001 ->   1,
5435 //       0010 ->   1,
5436 //       0011 ->   2,
5437 //       0100 ->   1,
5438 //       0101 ->   2,
5439 //       0110 ->   2,
5440 //       0111 ->   3,
5441 //       1000 ->   1,
5442 //       1001 ->   2,
5443 //       1010 ->   2,
5444 //       1011 ->   3,
5445 //       1100 ->   2,
5446 //       1101 ->   3,
     //       1110 ->   3,
5447 //       1111 ->   4 ]
5448 //  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
5449 //     shuffle indices for lookup table access.
5450 //  b. Right shift each byte of the vector lane by 4 positions.
5451 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5452 //     shuffle indices for lookup table access.
5453 //  d. Add the bitset counts of the upper and lower 4 bits of each byte.
5454 //  e. Unpack double words to quad words and compute the sum of absolute differences of
5455 //     the bitset counts of all the bytes of a quadword.
5456 //  f. Perform step e. for the upper 128 bit vector lane.
5457 //  g. Pack the bitset counts of the quadwords back to double words.
5458 //  h. Unpacking and packing operations are not needed for 64 bit vector lanes.
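     //
     // For example, for the byte 0xB6 = 0b10110110 the lower nibble 0b0110 maps to 2 and
     // the upper nibble 0b1011 maps to 3 in the table above, so the byte popcount is
     // 2 + 3 = 5.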
5459 
5460 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5461                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5462   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5463   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5464   vpsrlw(dst, src, 4, vec_enc);
5465   vpand(dst, dst, xtmp1, vec_enc);
5466   vpand(xtmp1, src, xtmp1, vec_enc);
5467   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5468   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5469   vpshufb(dst, xtmp2, dst, vec_enc);
5470   vpaddb(dst, dst, xtmp1, vec_enc);
5471 }
5472 
5473 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5474                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5475   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5476   // The following code corresponds to steps e, f, g and h of the above algorithm.
5477   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5478   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5479   vpsadbw(dst, dst, xtmp2, vec_enc);
5480   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5481   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5482   vpackuswb(dst, xtmp1, dst, vec_enc);
5483 }
5484 
5485 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5486                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5487   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5488   // Add the popcount of upper and lower bytes of word.
5489   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5490   vpsrlw(dst, xtmp1, 8, vec_enc);
5491   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5492   vpaddw(dst, dst, xtmp1, vec_enc);
5493 }
5494 
5495 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5496                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5497   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5498   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5499   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5500 }
5501 
5502 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5503                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5504   switch(bt) {
5505     case T_LONG:
5506       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5507       break;
5508     case T_INT:
5509       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5510       break;
5511     case T_CHAR:
5512     case T_SHORT:
5513       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5514       break;
5515     case T_BYTE:
5516     case T_BOOLEAN:
5517       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5518       break;
5519     default:
5520       fatal("Unsupported type %s", type2name(bt));
5521       break;
5522   }
5523 }
5524 
5525 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5526                                                       KRegister mask, bool merge, int vec_enc) {
5527   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5528   switch(bt) {
5529     case T_LONG:
5530       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5531       evpopcntq(dst, mask, src, merge, vec_enc);
5532       break;
5533     case T_INT:
5534       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5535       evpopcntd(dst, mask, src, merge, vec_enc);
5536       break;
5537     case T_CHAR:
5538     case T_SHORT:
5539       assert(VM_Version::supports_avx512_bitalg(), "");
5540       evpopcntw(dst, mask, src, merge, vec_enc);
5541       break;
5542     case T_BYTE:
5543     case T_BOOLEAN:
5544       assert(VM_Version::supports_avx512_bitalg(), "");
5545       evpopcntb(dst, mask, src, merge, vec_enc);
5546       break;
5547     default:
5548       fatal("Unsupported type %s", type2name(bt));
5549       break;
5550   }
5551 }
5552 
5553 #ifndef _LP64
5554 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5555   assert(VM_Version::supports_avx512bw(), "");
5556   kmovdl(tmp, src);
5557   kunpckdql(dst, tmp, tmp);
5558 }
5559 #endif
5560 
5561 // The bit reversal algorithm first reverses the bits of each byte and then performs
5562 // a byte level reversal for multi-byte primitive types (short/int/long). The
5563 // algorithm performs a lookup table access to get the reverse bit sequence
5564 // corresponding to a 4 bit value. Thus the reverse bit sequence of a byte is
5565 // obtained by swapping the reverse bit sequences of the upper and lower
5566 // nibbles of that byte.
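     // For example, reversing the byte 0x1F = 0b00011111 looks up the lower nibble
     // 0b1111 -> 0b1111 and shifts it left by 4 (0b11110000), shifts the upper nibble
     // down by 4 and looks up 0b0001 -> 0b1000 (0b00001000), and ORs the two partial
     // results to obtain 0b11111000 = 0xF8, the bit reversed byte.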
5567 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5568                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5569   if (VM_Version::supports_avx512vlbw()) {
5570 
5571     // Get the reverse bit sequence of lower nibble of each byte.
5572     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5573     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5574     evpandq(dst, xtmp2, src, vec_enc);
5575     vpshufb(dst, xtmp1, dst, vec_enc);
5576     vpsllq(dst, dst, 4, vec_enc);
5577 
5578     // Get the reverse bit sequence of upper nibble of each byte.
5579     vpandn(xtmp2, xtmp2, src, vec_enc);
5580     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5581     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5582 
5583     // OR the left shifted reverse bit sequence of the lower nibble with the right shifted
5584     // reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5585     evporq(xtmp2, dst, xtmp2, vec_enc);
5586     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5587 
5588   } else if (vec_enc == Assembler::AVX_512bit) {
5589     // Shift based bit reversal.
5590     assert(bt == T_LONG || bt == T_INT, "");
5591 
5592     // Swap lower and upper nibble of each byte.
5593     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5594 
5595     // Swap two least and most significant bits of each nibble.
5596     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5597 
5598     // Swap adjacent pair of bits.
5599     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5600     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5601 
5602     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5603     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5604   } else {
5605     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5606     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5607 
5608     // Get the reverse bit sequence of lower nibble of each byte.
5609     vpand(dst, xtmp2, src, vec_enc);
5610     vpshufb(dst, xtmp1, dst, vec_enc);
5611     vpsllq(dst, dst, 4, vec_enc);
5612 
5613     // Get the reverse bit sequence of upper nibble of each byte.
5614     vpandn(xtmp2, xtmp2, src, vec_enc);
5615     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5616     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5617 
5618     // OR the left shifted reverse bit sequence of the lower nibble with the right shifted
5619     // reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5620     vpor(xtmp2, dst, xtmp2, vec_enc);
5621     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5622   }
5623 }
5624 
5625 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5626                                                 XMMRegister xtmp, Register rscratch) {
5627   assert(VM_Version::supports_gfni(), "");
5628   assert(rscratch != noreg || always_reachable(mask), "missing");
5629 
5630   // Galois field instruction based bit reversal, using the algorithm described at
5631   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5632   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5633   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5634   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5635 }
5636 
5637 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5638                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5639   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5640   evpandq(dst, xtmp1, src, vec_enc);
5641   vpsllq(dst, dst, nbits, vec_enc);
5642   vpandn(xtmp1, xtmp1, src, vec_enc);
5643   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5644   evporq(dst, dst, xtmp1, vec_enc);
5645 }
5646 
5647 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5648                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5649   // Shift based bit reversal.
5650   assert(VM_Version::supports_evex(), "");
5651   switch(bt) {
5652     case T_LONG:
5653       // Swap upper and lower double word of each quad word.
5654       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5655       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5656       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5657       break;
5658     case T_INT:
5659       // Swap upper and lower word of each double word.
5660       evprord(xtmp1, k0, src, 16, true, vec_enc);
5661       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5662       break;
5663     case T_CHAR:
5664     case T_SHORT:
5665       // Swap upper and lower byte of each word.
5666       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5667       break;
5668     case T_BYTE:
5669       evmovdquq(dst, k0, src, true, vec_enc);
5670       break;
5671     default:
5672       fatal("Unsupported type %s", type2name(bt));
5673       break;
5674   }
5675 }
5676 
5677 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5678   if (bt == T_BYTE) {
5679     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5680       evmovdquq(dst, k0, src, true, vec_enc);
5681     } else {
5682       vmovdqu(dst, src);
5683     }
5684     return;
5685   }
5686   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5687   // pre-computed shuffle indices.
5688   switch(bt) {
5689     case T_LONG:
5690       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5691       break;
5692     case T_INT:
5693       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5694       break;
5695     case T_CHAR:
5696     case T_SHORT:
5697       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5698       break;
5699     default:
5700       fatal("Unsupported type %s", type2name(bt));
5701       break;
5702   }
5703   vpshufb(dst, src, dst, vec_enc);
5704 }
5705 
5706 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5707                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5708                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5709   assert(is_integral_type(bt), "");
5710   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5711   assert(VM_Version::supports_avx512cd(), "");
5712   switch(bt) {
5713     case T_LONG:
5714       evplzcntq(dst, ktmp, src, merge, vec_enc);
5715       break;
5716     case T_INT:
5717       evplzcntd(dst, ktmp, src, merge, vec_enc);
5718       break;
5719     case T_SHORT:
5720       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5721       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5722       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5723       vpunpckhwd(dst, xtmp1, src, vec_enc);
5724       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5725       vpackusdw(dst, xtmp2, dst, vec_enc);
5726       break;
5727     case T_BYTE:
5728       // T1 = leading zero count of the 4 LSB bits of each byte, computed by
5729       //      accessing the lookup table.
5730       // T2 = leading zero count of the 4 MSB bits of each byte, computed by
5731       //      accessing the lookup table.
5732       // Add T1 to T2 if the 4 MSB bits of the byte are all zeros.
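           // For example, for the byte 0x07 the 4 MSB bits are all zeros, so T2 = 4 and
           // T1 = 1 (the 4 bit leading zero count of 0b0111), giving T2 + T1 = 5, which
           // matches the 5 leading zeros of 0b00000111.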
5733       assert(VM_Version::supports_avx512bw(), "");
5734       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5735       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5736       vpand(xtmp2, dst, src, vec_enc);
5737       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5738       vpsrlw(xtmp3, src, 4, vec_enc);
5739       vpand(xtmp3, dst, xtmp3, vec_enc);
5740       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5741       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5742       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5743       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5744       break;
5745     default:
5746       fatal("Unsupported type %s", type2name(bt));
5747       break;
5748   }
5749 }
5750 
5751 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5752                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5753   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5754   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5755   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5756   // accessing the lookup table.
5757   vpand(dst, xtmp2, src, vec_enc);
5758   vpshufb(dst, xtmp1, dst, vec_enc);
5759   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5760   // accessing the lookup table.
5761   vpsrlw(xtmp3, src, 4, vec_enc);
5762   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5763   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5764   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5765   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5766   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5767   vpaddb(dst, dst, xtmp2, vec_enc);
5768   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5769 }
5770 
5771 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5772                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5773   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5774   // Add zero counts of lower byte and upper byte of a word if
5775   // upper byte holds a zero value.
5776   vpsrlw(xtmp3, src, 8, vec_enc);
5777   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5778   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5779   vpsllw(xtmp2, dst, 8, vec_enc);
5780   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5781   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5782   vpsrlw(dst, dst, 8, vec_enc);
5783 }
5784 
5785 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5786                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5787   // Since the IEEE 754 floating point format represents the mantissa in normalized
5788   // 1.x form, the biased exponent can be used to compute the leading zero count as
5789   // per the following formula:
5790   // LZCNT = 32 - ((biased_exp - 127) + 1)
5791   // Special handling is required for zero, MAX_INT and negative source values.
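       // For example, for src = 8 the conversion to float yields a biased exponent of
       // 130, so 130 - 127 = 3, incrementing gives 4 and LZCNT = 32 - 4 = 28, which
       // matches the 28 leading zeros of 0x00000008.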
5792 
5793   // Broadcast 0xFF
5794   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5795   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5796 
5797   // Extract biased exponent.
5798   vcvtdq2ps(dst, src, vec_enc);
5799   vpsrld(dst, dst, 23, vec_enc);
5800   vpand(dst, dst, xtmp1, vec_enc);
5801 
5802   // Broadcast 127.
5803   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5804   // Exponent = biased_exp - 127
5805   vpsubd(dst, dst, xtmp1, vec_enc);
5806 
5807   // Exponent = Exponent + 1
5808   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5809   vpaddd(dst, dst, xtmp3, vec_enc);
5810 
5811   // Replace a negative exponent with zero; the exponent is negative when the src
5812   // lane contains a zero value.
5813   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5814   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5815 
5816   // Rematerialize broadcast 32.
5817   vpslld(xtmp1, xtmp3, 5, vec_enc);
5818   // Exponent is 32 if corresponding source lane contains max_int value.
5819   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5820   // LZCNT = 32 - exponent
5821   vpsubd(dst, xtmp1, dst, vec_enc);
5822 
5823   // Replace LZCNT with a value 1 if corresponding source lane
5824   // contains max_int value.
5825   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5826 
5827   // Replace LZCNT with 0 if the source lane value is negative.
5828   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5829   vblendvps(dst, dst, xtmp2, src, vec_enc);
5830 }
5831 
5832 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5833                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5834   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5835   // Add zero counts of lower word and upper word of a double word if
5836   // upper word holds a zero value.
5837   vpsrld(xtmp3, src, 16, vec_enc);
5838   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5839   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5840   vpslld(xtmp2, dst, 16, vec_enc);
5841   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5842   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5843   vpsrld(dst, dst, 16, vec_enc);
5844   // Add zero counts of lower doubleword and upper doubleword of a
5845   // quadword if upper doubleword holds a zero value.
5846   vpsrlq(xtmp3, src, 32, vec_enc);
5847   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5848   vpsllq(xtmp2, dst, 32, vec_enc);
5849   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5850   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5851   vpsrlq(dst, dst, 32, vec_enc);
5852 }
5853 
5854 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5855                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5856                                                        Register rtmp, int vec_enc) {
5857   assert(is_integral_type(bt), "unexpected type");
5858   assert(vec_enc < Assembler::AVX_512bit, "");
5859   switch(bt) {
5860     case T_LONG:
5861       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5862       break;
5863     case T_INT:
5864       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5865       break;
5866     case T_SHORT:
5867       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5868       break;
5869     case T_BYTE:
5870       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5871       break;
5872     default:
5873       fatal("Unsupported type %s", type2name(bt));
5874       break;
5875   }
5876 }
5877 
5878 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5879   switch(bt) {
5880     case T_BYTE:
5881       vpsubb(dst, src1, src2, vec_enc);
5882       break;
5883     case T_SHORT:
5884       vpsubw(dst, src1, src2, vec_enc);
5885       break;
5886     case T_INT:
5887       vpsubd(dst, src1, src2, vec_enc);
5888       break;
5889     case T_LONG:
5890       vpsubq(dst, src1, src2, vec_enc);
5891       break;
5892     default:
5893       fatal("Unsupported type %s", type2name(bt));
5894       break;
5895   }
5896 }
5897 
5898 // Trailing zero count computation is based on the leading zero count operation as per
5899 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
5900 // a direct vector instruction to compute the leading zero count.
5901 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
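     // For example, for x = 12 = 0b1100, (x - 1) & ~x = 0b1011 & ~0b1100 = 0b0011, whose
     // 32 bit leading zero count is 30, so CTZ = 32 - 30 = 2, the number of trailing
     // zeros of x.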
5902 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5903                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5904                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5905   assert(is_integral_type(bt), "");
5906   // xtmp = -1
5907   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5908   // xtmp = xtmp + src
5909   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5910   // xtmp = xtmp & ~src
5911   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5912   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5913   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5914   vpsub(bt, dst, xtmp4, dst, vec_enc);
5915 }
5916 
5917 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per
5918 // the following equation.  CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
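     // For example, for x = 12, -x = 0xFFFFFFF4 and x | -x = 0xFFFFFFFC, whose popcount
     // is 30, so CTZ = 32 - 30 = 2, the number of trailing zeros of x.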
5919 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5920                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5921   assert(is_integral_type(bt), "");
5922   // xtmp = 0
5923   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5924   // xtmp = 0 - src
5925   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5926   // xtmp = xtmp | src
5927   vpor(xtmp3, xtmp3, src, vec_enc);
5928   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5929   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5930   vpsub(bt, dst, xtmp1, dst, vec_enc);
5931 }
5932 
5933 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5934   Label done;
5935   Label neg_divisor_fastpath;
5936   cmpl(divisor, 0);
5937   jccb(Assembler::less, neg_divisor_fastpath);
5938   xorl(rdx, rdx);
5939   divl(divisor);
5940   jmpb(done);
5941   bind(neg_divisor_fastpath);
5942   // Fastpath for divisor < 0:
5943   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5944   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
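       // Since the divisor is negative its unsigned value is at least 2^31, so the unsigned
       // quotient can only be 0 or 1; it is 1 exactly when bit 31 of the dividend is set and
       // bit 31 of (dividend - divisor) is clear, which is what the expression above extracts.
       // For example, dividend = 0xF0000000 and divisor = 0x80000000 gives
       // (0xF0000000 & ~0x70000000) >>> 31 = 0x80000000 >>> 31 = 1, the expected unsigned quotient.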
5945   movl(rdx, rax);
5946   subl(rdx, divisor);
5947   if (VM_Version::supports_bmi1()) {
5948     andnl(rax, rdx, rax);
5949   } else {
5950     notl(rdx);
5951     andl(rax, rdx);
5952   }
5953   shrl(rax, 31);
5954   bind(done);
5955 }
5956 
5957 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5958   Label done;
5959   Label neg_divisor_fastpath;
5960   cmpl(divisor, 0);
5961   jccb(Assembler::less, neg_divisor_fastpath);
5962   xorl(rdx, rdx);
5963   divl(divisor);
5964   jmpb(done);
5965   bind(neg_divisor_fastpath);
5966   // Fastpath when divisor < 0:
5967   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5968   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
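       // Since the divisor is negative the unsigned quotient is either 0 or 1, so the masked
       // term above is either 0 or the divisor itself. For example, with dividend = 0xF0000000
       // and divisor = 0x80000000, dividend & ~(dividend - divisor) = 0x80000000; the sign
       // extending shift by 31 gives all ones, the mask selects the divisor, and the remainder
       // is 0xF0000000 - 0x80000000 = 0x70000000.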
5969   movl(rdx, rax);
5970   subl(rax, divisor);
5971   if (VM_Version::supports_bmi1()) {
5972     andnl(rax, rax, rdx);
5973   } else {
5974     notl(rax);
5975     andl(rax, rdx);
5976   }
5977   sarl(rax, 31);
5978   andl(rax, divisor);
5979   subl(rdx, rax);
5980   bind(done);
5981 }
5982 
5983 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5984   Label done;
5985   Label neg_divisor_fastpath;
5986 
5987   cmpl(divisor, 0);
5988   jccb(Assembler::less, neg_divisor_fastpath);
5989   xorl(rdx, rdx);
5990   divl(divisor);
5991   jmpb(done);
5992   bind(neg_divisor_fastpath);
5993   // Fastpath for divisor < 0:
5994   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5995   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5996   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5997   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5998   movl(rdx, rax);
5999   subl(rax, divisor);
6000   if (VM_Version::supports_bmi1()) {
6001     andnl(rax, rax, rdx);
6002   } else {
6003     notl(rax);
6004     andl(rax, rdx);
6005   }
6006   movl(tmp, rax);
6007   shrl(rax, 31); // quotient
6008   sarl(tmp, 31);
6009   andl(tmp, divisor);
6010   subl(rdx, tmp); // remainder
6011   bind(done);
6012 }
6013 
6014 #ifdef _LP64
6015 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6016                                  XMMRegister xtmp2, Register rtmp) {
6017   if (VM_Version::supports_gfni()) {
6018     // Galois field instruction based bit reversal, using the algorithm described at
6019     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
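         // The constant 0x8040201008040201 encodes the 8x8 bit matrix for which the GF(2)
         // affine transform below reverses the bit order within each byte; the bswapl at
         // the end of this method then reverses the byte order, completing the 32 bit reversal.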
6020     mov64(rtmp, 0x8040201008040201L);
6021     movq(xtmp1, src);
6022     movq(xtmp2, rtmp);
6023     gf2p8affineqb(xtmp1, xtmp2, 0);
6024     movq(dst, xtmp1);
6025   } else {
6026     // Swap even and odd numbered bits.
6027     movl(rtmp, src);
6028     andl(rtmp, 0x55555555);
6029     shll(rtmp, 1);
6030     movl(dst, src);
6031     andl(dst, 0xAAAAAAAA);
6032     shrl(dst, 1);
6033     orl(dst, rtmp);
6034 
6035     // Swap LSB and MSB 2 bits of each nibble.
6036     movl(rtmp, dst);
6037     andl(rtmp, 0x33333333);
6038     shll(rtmp, 2);
6039     andl(dst, 0xCCCCCCCC);
6040     shrl(dst, 2);
6041     orl(dst, rtmp);
6042 
6043     // Swap LSB and MSB 4 bits of each byte.
6044     movl(rtmp, dst);
6045     andl(rtmp, 0x0F0F0F0F);
6046     shll(rtmp, 4);
6047     andl(dst, 0xF0F0F0F0);
6048     shrl(dst, 4);
6049     orl(dst, rtmp);
6050   }
6051   bswapl(dst);
6052 }
6053 
6054 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6055                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6056   if (VM_Version::supports_gfni()) {
6057     // Galois field instruction based bit reversal, using the algorithm described at
6058     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6059     mov64(rtmp1, 0x8040201008040201L);
6060     movq(xtmp1, src);
6061     movq(xtmp2, rtmp1);
6062     gf2p8affineqb(xtmp1, xtmp2, 0);
6063     movq(dst, xtmp1);
6064   } else {
6065     // Swap even and odd numbered bits.
6066     movq(rtmp1, src);
6067     mov64(rtmp2, 0x5555555555555555L);
6068     andq(rtmp1, rtmp2);
6069     shlq(rtmp1, 1);
6070     movq(dst, src);
6071     notq(rtmp2);
6072     andq(dst, rtmp2);
6073     shrq(dst, 1);
6074     orq(dst, rtmp1);
6075 
6076     // Swap LSB and MSB 2 bits of each nibble.
6077     movq(rtmp1, dst);
6078     mov64(rtmp2, 0x3333333333333333L);
6079     andq(rtmp1, rtmp2);
6080     shlq(rtmp1, 2);
6081     notq(rtmp2);
6082     andq(dst, rtmp2);
6083     shrq(dst, 2);
6084     orq(dst, rtmp1);
6085 
6086     // Swap LSB and MSB 4 bits of each byte.
6087     movq(rtmp1, dst);
6088     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6089     andq(rtmp1, rtmp2);
6090     shlq(rtmp1, 4);
6091     notq(rtmp2);
6092     andq(dst, rtmp2);
6093     shrq(dst, 4);
6094     orq(dst, rtmp1);
6095   }
6096   bswapq(dst);
6097 }
6098 
6099 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6100   Label done;
6101   Label neg_divisor_fastpath;
6102   cmpq(divisor, 0);
6103   jccb(Assembler::less, neg_divisor_fastpath);
6104   xorl(rdx, rdx);
6105   divq(divisor);
6106   jmpb(done);
6107   bind(neg_divisor_fastpath);
6108   // Fastpath for divisor < 0:
6109   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6110   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6111   movq(rdx, rax);
6112   subq(rdx, divisor);
6113   if (VM_Version::supports_bmi1()) {
6114     andnq(rax, rdx, rax);
6115   } else {
6116     notq(rdx);
6117     andq(rax, rdx);
6118   }
6119   shrq(rax, 63);
6120   bind(done);
6121 }
6122 
6123 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6124   Label done;
6125   Label neg_divisor_fastpath;
6126   cmpq(divisor, 0);
6127   jccb(Assembler::less, neg_divisor_fastpath);
6128   xorq(rdx, rdx);
6129   divq(divisor);
6130   jmp(done);
6131   bind(neg_divisor_fastpath);
6132   // Fastpath when divisor < 0:
6133   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6134   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6135   movq(rdx, rax);
6136   subq(rax, divisor);
6137   if (VM_Version::supports_bmi1()) {
6138     andnq(rax, rax, rdx);
6139   } else {
6140     notq(rax);
6141     andq(rax, rdx);
6142   }
6143   sarq(rax, 63);
6144   andq(rax, divisor);
6145   subq(rdx, rax);
6146   bind(done);
6147 }
6148 
6149 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6150   Label done;
6151   Label neg_divisor_fastpath;
6152   cmpq(divisor, 0);
6153   jccb(Assembler::less, neg_divisor_fastpath);
6154   xorq(rdx, rdx);
6155   divq(divisor);
6156   jmp(done);
6157   bind(neg_divisor_fastpath);
6158   // Fastpath for divisor < 0:
6159   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6160   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6161   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6162   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6163   movq(rdx, rax);
6164   subq(rax, divisor);
6165   if (VM_Version::supports_bmi1()) {
6166     andnq(rax, rax, rdx);
6167   } else {
6168     notq(rax);
6169     andq(rax, rdx);
6170   }
6171   movq(tmp, rax);
6172   shrq(rax, 63); // quotient
6173   sarq(tmp, 63);
6174   andq(tmp, divisor);
6175   subq(rdx, tmp); // remainder
6176   bind(done);
6177 }
6178 #endif
6179 
6180 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6181                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6182                                         int vlen_enc) {
6183   assert(VM_Version::supports_avx512bw(), "");
6184   // Byte shuffles are inlane operations and indices are determined using
6185   // the lower 4 bits of each shuffle lane, thus all shuffle indices are
6186   // normalized to the index range 0-15. This ensures that index values which are
6187   // congruent modulo 16 select the same relative byte position within a 128 bit
6188   // lane, i.e. shuffle indices 16, 32 and 48 all select the first byte of their
6189   // respective 128 bit lanes.
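       // For example, shuffle index 37 = 32 + 5 reduces to the in-lane index 5; it is matched
       // by the INDEX >= 32 && INDEX < 48 pass below, which broadcasts the third 128 bit lane
       // of src, so byte 5 of that lane (i.e. byte 37 of src) is selected.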
6190   movl(rtmp, 16);
6191   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6192 
6193   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6194   // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
6195   // the original shuffle indices, and move the shuffled lanes corresponding to a true
6196   // mask to the destination vector.
6197   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6198   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6199   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6200 
6201   // Repeat the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
6202   // broadcasting the second 128 bit lane.
6203   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6204   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6205   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6206   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6207   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6208 
6209   // Repeat the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
6210   // broadcasting the third 128 bit lane.
6211   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6212   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6213   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6214   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6215   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6216 
6217   // Repeat the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
6218   // broadcasting the fourth 128 bit lane.
6219   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6220   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6221   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6222   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6223   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6224 }
6225 
6226 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6227                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6228   if (vlen_enc == AVX_128bit) {
6229     vpermilps(dst, src, shuffle, vlen_enc);
6230   } else if (bt == T_INT) {
6231     vpermd(dst, shuffle, src, vlen_enc);
6232   } else {
6233     assert(bt == T_FLOAT, "");
6234     vpermps(dst, shuffle, src, vlen_enc);
6235   }
6236 }