1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub, int max_monitors) {
  49 
  50   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  51   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes;
  // the frame allocation can be either 3 or 6 bytes. So if we don't do a
  // stack bang then we must use the 6-byte frame allocation even if
  55   // we have no frame. :-(
  56   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  57 
  58   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  59   // Remove word for return addr
  60   framesize -= wordSize;
  61   stack_bang_size -= wordSize;
  62 
  63   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  67   // See bugs 4446381, 4468289, 4497237.
  68   if (stack_bang_size > 0) {
  69     generate_stack_overflow_check(stack_bang_size);
  70 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  73     push(rbp);
  74     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  75     if (PreserveFramePointer) {
  76       mov(rbp, rsp);
  77     }
  78     // Remove word for ebp
  79     framesize -= wordSize;
  80 
  81     // Create frame
  82     if (framesize) {
  83       subptr(rsp, framesize);
  84     }
  85   } else {
  86     // Create frame (force generation of a 4 byte immediate value)
  87     subptr_imm32(rsp, framesize);
  88 
  89     // Save RBP register now.
  90     framesize -= wordSize;
  91     movptr(Address(rsp, framesize), rbp);
  92     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  93     if (PreserveFramePointer) {
  94       movptr(rbp, rsp);
  95       if (framesize > 0) {
  96         addptr(rbp, framesize);
  97       }
  98     }
  99   }
 100 
 101   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 102     framesize -= wordSize;
 103     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 104   }
 105 
 106 #ifndef _LP64
 107   // If method sets FPU control word do it now
 108   if (fp_mode_24b) {
 109     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 110   }
 111   if (UseSSE >= 2 && VerifyFPU) {
 112     verify_FPU(0, "FPU stack must be clean on entry");
 113   }
 114 #endif
 115 
 116 #ifdef ASSERT
 117   if (VerifyStackAtCalls) {
 118     Label L;
 119     push(rax);
 120     mov(rax, rsp);
 121     andptr(rax, StackAlignmentInBytes-1);
 122     cmpptr(rax, StackAlignmentInBytes-wordSize);
 123     pop(rax);
 124     jcc(Assembler::equal, L);
 125     STOP("Stack is not properly aligned!");
 126     bind(L);
 127   }
 128 #endif
 129 
 130 #ifdef _LP64
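  // With fast locking, verify at method entry that the thread's lock stack has
  // room for the worst-case monitor nesting of this method: if
  //   lock_stack_current + max_monitors * oopSize >= lock_stack_limit
  // we branch to the out-of-line C2CheckLockStackStub, which is expected to deal
  // with the (nearly) full lock stack before continuing here.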
 131   if (UseFastLocking && max_monitors > 0) {
 132     C2CheckLockStackStub* stub = new (Compile::current()->comp_arena()) C2CheckLockStackStub();
 133     Compile::current()->output()->add_stub(stub);
 134     assert(!is_stub, "only methods have monitors");
 135     Register thread = r15_thread;
 136     movptr(rax, Address(thread, JavaThread::lock_stack_current_offset()));
 137     addptr(rax, max_monitors * oopSize);
 138     cmpptr(rax, Address(thread, JavaThread::lock_stack_limit_offset()));
 139     jcc(Assembler::greaterEqual, stub->entry());
 140     bind(stub->continuation());
 141   }
 142 #endif
 143 
 144   if (!is_stub) {
 145     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 147     if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
 148       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 149       Label dummy_slow_path;
 150       Label dummy_continuation;
 151       Label* slow_path = &dummy_slow_path;
 152       Label* continuation = &dummy_continuation;
 153       if (!Compile::current()->output()->in_scratch_emit_size()) {
 154         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 155         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 156         Compile::current()->output()->add_stub(stub);
 157         slow_path = &stub->entry();
 158         continuation = &stub->continuation();
 159       }
 160       bs->nmethod_entry_barrier(this, slow_path, continuation);
 161     }
 162 #else
 163     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 164     bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
 165 #endif
 166   }
 167 }
 168 
 169 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
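  // 4- and 8-byte vectors are operated on in full 128-bit XMM registers,
  // so they share the AVX_128bit encoding.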
 170   switch (vlen_in_bytes) {
 171     case  4: // fall-through
 172     case  8: // fall-through
 173     case 16: return Assembler::AVX_128bit;
 174     case 32: return Assembler::AVX_256bit;
 175     case 64: return Assembler::AVX_512bit;
 176 
 177     default: {
 178       ShouldNotReachHere();
 179       return Assembler::AVX_NoVec;
 180     }
 181   }
 182 }
 183 
 184 #if INCLUDE_RTM_OPT
 185 
 186 // Update rtm_counters based on abort status
 187 // input: abort_status
 188 //        rtm_counters (RTMLockingCounters*)
 189 // flags are killed
 190 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 191 
 192   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 193   if (PrintPreciseRTMLockingStatistics) {
 194     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 195       Label check_abort;
 196       testl(abort_status, (1<<i));
 197       jccb(Assembler::equal, check_abort);
 198       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 199       bind(check_abort);
 200     }
 201   }
 202 }
 203 
// Branch if (random & (count-1)) != 0; count must be a power of two (2^n)
 205 // tmp, scr and flags are killed
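// The low bits of the time-stamp counter serve as a cheap pseudo-random source:
// with a power-of-two count, the fall-through (no-branch) path is taken roughly
// once every 'count' calls, which is what the sampled RTM counter updates rely on.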
 206 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 207   assert(tmp == rax, "");
 208   assert(scr == rdx, "");
 209   rdtsc(); // modifies EDX:EAX
 210   andptr(tmp, count-1);
 211   jccb(Assembler::notZero, brLabel);
 212 }
 213 
 214 // Perform abort ratio calculation, set no_rtm bit if high ratio
 215 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 216 // tmpReg, rtm_counters_Reg and flags are killed
 217 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 218                                                     Register rtm_counters_Reg,
 219                                                     RTMLockingCounters* rtm_counters,
 220                                                     Metadata* method_data) {
 221   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 222 
 223   if (RTMLockingCalculationDelay > 0) {
 224     // Delay calculation
 225     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 226     testptr(tmpReg, tmpReg);
 227     jccb(Assembler::equal, L_done);
 228   }
 229   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 230   //   Aborted transactions = abort_count * 100
 231   //   All transactions = total_count *  RTMTotalCountIncrRate
 232   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 233 
 234   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 235   cmpptr(tmpReg, RTMAbortThreshold);
 236   jccb(Assembler::below, L_check_always_rtm2);
 237   imulptr(tmpReg, tmpReg, 100);
 238 
 239   Register scrReg = rtm_counters_Reg;
 240   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 241   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 242   imulptr(scrReg, scrReg, RTMAbortRatio);
 243   cmpptr(tmpReg, scrReg);
 244   jccb(Assembler::below, L_check_always_rtm1);
 245   if (method_data != NULL) {
 246     // set rtm_state to "no rtm" in MDO
 247     mov_metadata(tmpReg, method_data);
 248     lock();
 249     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 250   }
 251   jmpb(L_done);
 252   bind(L_check_always_rtm1);
 253   // Reload RTMLockingCounters* address
 254   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 255   bind(L_check_always_rtm2);
 256   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 257   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 258   jccb(Assembler::below, L_done);
 259   if (method_data != NULL) {
 260     // set rtm_state to "always rtm" in MDO
 261     mov_metadata(tmpReg, method_data);
 262     lock();
 263     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 264   }
 265   bind(L_done);
 266 }
 267 
 268 // Update counters and perform abort ratio calculation
 269 // input:  abort_status_Reg
 270 // rtm_counters_Reg, flags are killed
 271 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 272                                       Register rtm_counters_Reg,
 273                                       RTMLockingCounters* rtm_counters,
 274                                       Metadata* method_data,
 275                                       bool profile_rtm) {
 276 
 277   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 278   // update rtm counters based on rax value at abort
 279   // reads abort_status_Reg, updates flags
 280   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 281   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 282   if (profile_rtm) {
 283     // Save abort status because abort_status_Reg is used by following code.
 284     if (RTMRetryCount > 0) {
 285       push(abort_status_Reg);
 286     }
 287     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 288     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 289     // restore abort status
 290     if (RTMRetryCount > 0) {
 291       pop(abort_status_Reg);
 292     }
 293   }
 294 }
 295 
 296 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 297 // inputs: retry_count_Reg
 298 //       : abort_status_Reg
 299 // output: retry_count_Reg decremented by 1
 300 // flags are killed
 301 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 302   Label doneRetry;
 303   assert(abort_status_Reg == rax, "");
 304   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 305   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 306   // if reason is in 0x6 and retry count != 0 then retry
 307   andptr(abort_status_Reg, 0x6);
 308   jccb(Assembler::zero, doneRetry);
 309   testl(retry_count_Reg, retry_count_Reg);
 310   jccb(Assembler::zero, doneRetry);
 311   pause();
 312   decrementl(retry_count_Reg);
 313   jmp(retryLabel);
 314   bind(doneRetry);
 315 }
 316 
// Spin and retry if the lock is busy.
 318 // inputs: box_Reg (monitor address)
 319 //       : retry_count_Reg
 320 // output: retry_count_Reg decremented by 1
 321 //       : clear z flag if retry count exceeded
 322 // tmp_Reg, scr_Reg, flags are killed
 323 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 324                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 325   Label SpinLoop, SpinExit, doneRetry;
 326   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 327 
 328   testl(retry_count_Reg, retry_count_Reg);
 329   jccb(Assembler::zero, doneRetry);
 330   decrementl(retry_count_Reg);
 331   movptr(scr_Reg, RTMSpinLoopCount);
 332 
 333   bind(SpinLoop);
 334   pause();
 335   decrementl(scr_Reg);
 336   jccb(Assembler::lessEqual, SpinExit);
 337   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 338   testptr(tmp_Reg, tmp_Reg);
 339   jccb(Assembler::notZero, SpinLoop);
 340 
 341   bind(SpinExit);
 342   jmp(retryLabel);
 343   bind(doneRetry);
 344   incrementl(retry_count_Reg); // clear z flag
 345 }
 346 
 347 // Use RTM for normal stack locks
 348 // Input: objReg (object to lock)
 349 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 350                                          Register retry_on_abort_count_Reg,
 351                                          RTMLockingCounters* stack_rtm_counters,
 352                                          Metadata* method_data, bool profile_rtm,
 353                                          Label& DONE_LABEL, Label& IsInflated) {
 354   assert(UseRTMForStackLocks, "why call this otherwise?");
 355   assert(tmpReg == rax, "");
 356   assert(scrReg == rdx, "");
 357   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 358 
 359   if (RTMRetryCount > 0) {
 360     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 361     bind(L_rtm_retry);
 362   }
 363   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 364   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 365   jcc(Assembler::notZero, IsInflated);
 366 
 367   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 368     Label L_noincrement;
 369     if (RTMTotalCountIncrRate > 1) {
 370       // tmpReg, scrReg and flags are killed
 371       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 372     }
 373     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 374     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 375     bind(L_noincrement);
 376   }
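  // Transactional fast path: inside the transaction we only read the mark word.
  // If it still looks unlocked we are done -- the object is never actually
  // stack-locked, and a conflicting write to the mark word by another thread
  // aborts the transaction and lands us at L_on_abort below.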
 377   xbegin(L_on_abort);
 378   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 379   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 380   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 381   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 382 
 383   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 384   if (UseRTMXendForLockBusy) {
 385     xend();
 386     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 387     jmp(L_decrement_retry);
 388   }
 389   else {
 390     xabort(0);
 391   }
 392   bind(L_on_abort);
 393   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 394     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 395   }
 396   bind(L_decrement_retry);
 397   if (RTMRetryCount > 0) {
 398     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 399     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 400   }
 401 }
 402 
 403 // Use RTM for inflating locks
 404 // inputs: objReg (object to lock)
 405 //         boxReg (on-stack box address (displaced header location) - KILLED)
 406 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 407 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 408                                             Register scrReg, Register retry_on_busy_count_Reg,
 409                                             Register retry_on_abort_count_Reg,
 410                                             RTMLockingCounters* rtm_counters,
 411                                             Metadata* method_data, bool profile_rtm,
 412                                             Label& DONE_LABEL) {
 413   assert(UseRTMLocking, "why call this otherwise?");
 414   assert(tmpReg == rax, "");
 415   assert(scrReg == rdx, "");
 416   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 417   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 418 
 419   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 420   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 421 
 422   if (RTMRetryCount > 0) {
 423     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 424     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 425     bind(L_rtm_retry);
 426   }
 427   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 428     Label L_noincrement;
 429     if (RTMTotalCountIncrRate > 1) {
 430       // tmpReg, scrReg and flags are killed
 431       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 432     }
 433     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 434     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 435     bind(L_noincrement);
 436   }
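  // Transactional fast path for an already-inflated monitor: inside the
  // transaction we merely check that _owner is null and, if so, run the critical
  // section without ever writing _owner; a competing owner update conflicts with
  // our read and aborts the transaction (continuing at L_on_abort).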
 437   xbegin(L_on_abort);
 438   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 439   movptr(tmpReg, Address(tmpReg, owner_offset));
 440   testptr(tmpReg, tmpReg);
 441   jcc(Assembler::zero, DONE_LABEL);
 442   if (UseRTMXendForLockBusy) {
 443     xend();
 444     jmp(L_decrement_retry);
 445   }
 446   else {
 447     xabort(0);
 448   }
 449   bind(L_on_abort);
 450   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 451   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 452     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 453   }
 454   if (RTMRetryCount > 0) {
 455     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 456     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 457   }
 458 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 462 
 463   // Appears unlocked - try to swing _owner from null to non-null.
 464   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 465 #ifdef _LP64
 466   Register threadReg = r15_thread;
 467 #else
 468   get_thread(scrReg);
 469   Register threadReg = scrReg;
 470 #endif
 471   lock();
 472   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 473 
 474   if (RTMRetryCount > 0) {
 475     // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
 477     bind(L_decrement_retry);
 478     // Spin and retry if lock is busy.
 479     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 480   }
 481   else {
 482     bind(L_decrement_retry);
 483   }
 484 }
 485 
 486 #endif //  INCLUDE_RTM_OPT
 487 
 488 // fast_lock and fast_unlock used by C2
 489 
 490 // Because the transitions from emitted code to the runtime
 491 // monitorenter/exit helper stubs are so slow it's critical that
 492 // we inline both the stack-locking fast path and the inflated fast path.
 493 //
 494 // See also: cmpFastLock and cmpFastUnlock.
 495 //
 496 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 508 //
 509 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 510 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 511 // to those specialized methods.  That'd give us a mostly platform-independent
 512 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 514 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 515 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 516 // (b) explicit barriers or fence operations.
 517 //
 518 // TODO:
 519 //
 520 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 521 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 522 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 523 //    the lock operators would typically be faster than reifying Self.
 524 //
 525 // *  Ideally I'd define the primitives as:
 526 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 527 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 528 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
 531 //    sub-optimal code near the synchronization site.
 532 //
 533 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 534 //    Alternately, use a better sp-proximity test.
 535 //
 536 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 537 //    Either one is sufficient to uniquely identify a thread.
 538 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 539 //
 540 // *  Intrinsify notify() and notifyAll() for the common cases where the
 541 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 543 //
 544 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 545 //    But beware of excessive branch density on AMD Opterons.
 546 //
 547 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 548 //    or failure of the fast path.  If the fast path fails then we pass
 549 //    control to the slow path, typically in C.  In fast_lock and
 550 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 551 //    will emit a conditional branch immediately after the node.
 552 //    So we have branches to branches and lots of ICC.ZF games.
 553 //    Instead, it might be better to have C2 pass a "FailureLabel"
 554 //    into fast_lock and fast_unlock.  In the case of success, control
 555 //    will drop through the node.  ICC.ZF is undefined at exit.
 556 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 558 
 559 
 560 // obj: object to lock
 561 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 563 // scr: tmp -- KILLED
 564 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 565                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 566                                  RTMLockingCounters* rtm_counters,
 567                                  RTMLockingCounters* stack_rtm_counters,
 568                                  Metadata* method_data,
 569                                  bool use_rtm, bool profile_rtm) {
 570   // Ensure the register assignments are disjoint
 571   assert(tmpReg == rax, "");
 572 
 573   if (use_rtm) {
 574     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 575   } else {
 576     assert(cx1Reg == noreg, "");
 577     assert(cx2Reg == noreg, "");
 578     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 579   }
 580 
 581   // Possible cases that we'll encounter in fast_lock
 582   // ------------------------------------------------
 583   // * Inflated
 584   //    -- unlocked
 585   //    -- Locked
 586   //       = by self
 587   //       = by other
 588   // * neutral
 589   // * stack-locked
 590   //    -- by self
 591   //       = sp-proximity test hits
 592   //       = sp-proximity test generates false-negative
 593   //    -- by other
 594   //
 595 
 596   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 597 
 598   if (DiagnoseSyncOnValueBasedClasses != 0) {
 599     load_klass(tmpReg, objReg, scrReg);
 600     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 601     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 602     jcc(Assembler::notZero, DONE_LABEL);
 603   }
 604 
 605 #if INCLUDE_RTM_OPT
 606   if (UseRTMForStackLocks && use_rtm) {
 607     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 608     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 609                       stack_rtm_counters, method_data, profile_rtm,
 610                       DONE_LABEL, IsInflated);
 611   }
 612 #endif // INCLUDE_RTM_OPT
 613 
 614   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 615   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 616   jccb(Assembler::notZero, IsInflated);
 617 
 618   if (!UseHeavyMonitors) {
 619     if (UseFastLocking) {
 620 #ifdef _LP64
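      // Lightweight-locking fast path: fast_lock_impl attempts to lock the object
      // and record it on the current thread's lock stack, branching to NO_COUNT
      // (failure, ZF == 0) when it cannot; on success we jump to COUNT below.
      // (Sketch of the intent -- see fast_lock_impl for the details.)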
 621       fast_lock_impl(objReg, tmpReg, thread, scrReg, NO_COUNT, false);
 622       jmp(COUNT);
 623 #else
      // We cannot emit the lock-stack-check in verified_entry() because we don't have enough
 625       // registers (for thread ptr). Therefore we have to emit the lock-stack-check in
 626       // fast_lock_impl(). However, that check can take a slow-path with ZF=1, therefore
 627       // we need to handle it specially and force ZF=0 before taking the actual slow-path.
 628       Label slow;
 629       fast_lock_impl(objReg, tmpReg, thread, scrReg, slow);
 630       jmp(COUNT);
 631       bind(slow);
 632       testptr(objReg, objReg); // ZF=0 to indicate failure
 633       jmp(NO_COUNT);
 634 #endif
 635     } else {
 636       // Attempt stack-locking ...
 637       orptr (tmpReg, markWord::unlocked_value);
 638       movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 639       lock();
 640       cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 641       jcc(Assembler::equal, COUNT);           // Success
 642 
 643       // Recursive locking.
 644       // The object is stack-locked: markword contains stack pointer to BasicLock.
 645       // Locked by current thread if difference with current SP is less than one page.
 646       subptr(tmpReg, rsp);
      // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 648       andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 649       movptr(Address(boxReg, 0), tmpReg);
 650     }
 651   } else {
 652     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 653     testptr(objReg, objReg);
 654   }
 655   jmp(DONE_LABEL);
 656 
 657   bind(IsInflated);
 658   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 659 
 660 #if INCLUDE_RTM_OPT
 661   // Use the same RTM locking code in 32- and 64-bit VM.
 662   if (use_rtm) {
 663     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 664                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 665   } else {
 666 #endif // INCLUDE_RTM_OPT
 667 
 668 #ifndef _LP64
 669   // The object is inflated.
 670 
 671   // boxReg refers to the on-stack BasicLock in the current frame.
 672   // We'd like to write:
 673   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 675   // additional latency as we have another ST in the store buffer that must drain.
 676 
 677   // avoid ST-before-CAS
 678   // register juggle because we need tmpReg for cmpxchgptr below
 679   movptr(scrReg, boxReg);
 680   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 681 
 682   // Optimistic form: consider XORL tmpReg,tmpReg
 683   movptr(tmpReg, NULL_WORD);
 684 
 685   // Appears unlocked - try to swing _owner from null to non-null.
 686   // Ideally, I'd manifest "Self" with get_thread and then attempt
 687   // to CAS the register containing Self into m->Owner.
 688   // But we don't have enough registers, so instead we can either try to CAS
 689   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 690   // we later store "Self" into m->Owner.  Transiently storing a stack address
 691   // (rsp or the address of the box) into  m->owner is harmless.
 692   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 693   lock();
 694   cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 695   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 696 
 697   // If the CAS fails we can either retry or pass control to the slow path.
 698   // We use the latter tactic.
 699   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 700   // If the CAS was successful ...
 701   //   Self has acquired the lock
 702   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 703   // Intentional fall-through into DONE_LABEL ...
 704 #else // _LP64
 705   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 706   movq(scrReg, tmpReg);
 707   xorq(tmpReg, tmpReg);
 708   lock();
 709   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 710   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 711   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 712   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 713   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 714   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 715 
 716   cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
 717   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 718   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 719   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 720 #endif // _LP64
 721 #if INCLUDE_RTM_OPT
 722   } // use_rtm()
 723 #endif
 724   bind(DONE_LABEL);
 725 
 726   // ZFlag == 1 count in fast path
 727   // ZFlag == 0 count in slow path
 728   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 729 
 730   bind(COUNT);
 731   // Count monitors in fast path
 732 #ifndef _LP64
 733   get_thread(tmpReg);
 734   incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 735 #else // _LP64
 736   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 737 #endif
 738 
 739   xorl(tmpReg, tmpReg); // Set ZF == 1
 740 
 741   bind(NO_COUNT);
 742 
 743   // At NO_COUNT the icc ZFlag is set as follows ...
 744   // fast_unlock uses the same protocol.
 745   // ZFlag == 1 -> Success
 746   // ZFlag == 0 -> Failure - force control through the slow path
 747 }
 748 
 749 // obj: object to unlock
 750 // box: box address (displaced header location), killed.  Must be EAX.
 751 // tmp: killed, cannot be obj nor box.
 752 //
 753 // Some commentary on balanced locking:
 754 //
 755 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 756 // Methods that don't have provably balanced locking are forced to run in the
 757 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 758 // The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
 765 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 767 // B() doesn't have provably balanced locking so it runs in the interpreter.
 768 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 769 // is still locked by A().
 770 //
 771 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 772 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 773 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 774 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 775 // Arguably given that the spec legislates the JNI case as undefined our implementation
 776 // could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 778 // A perfectly viable alternative is to elide the owner check except when
 779 // Xcheck:jni is enabled.
 780 
 781 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 782   assert(boxReg == rax, "");
 783   assert_different_registers(objReg, boxReg, tmpReg);
 784 
 785   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 786 
 787 #if INCLUDE_RTM_OPT
 788   if (UseRTMForStackLocks && use_rtm) {
 789     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 790     Label L_regular_unlock;
 791     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 792     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 793     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 794     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 795     xend();                                                           // otherwise end...
 796     jmp(DONE_LABEL);                                                  // ... and we're done
 797     bind(L_regular_unlock);
 798   }
 799 #endif
 800 
 801   if (!UseHeavyMonitors && !UseFastLocking) {
 802     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 803     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 804   }
 805   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 806   if (!UseHeavyMonitors) {
 807     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 808 #if INCLUDE_RTM_OPT
 809     if (UseFastLocking && use_rtm) {
 810       jcc(Assembler::zero, Stacked);
 811     } else
 812 #endif
 813     jccb(Assembler::zero, Stacked);
 814     if (UseFastLocking) {
 815       // If the owner is ANONYMOUS, we need to fix it.
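      // (With lightweight locking, another thread may have inflated the monitor
      //  while we held the lock only via our lock stack; the monitor's owner is
      //  then ANONYMOUS_OWNER and the out-of-line stub is expected to claim it
      //  for the current thread before the normal inflated unlock below.)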
 816       testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) (intptr_t) ANONYMOUS_OWNER);
 817 #ifdef _LP64
 818       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg);
 819       Compile::current()->output()->add_stub(stub);
 820       jcc(Assembler::notEqual, stub->entry());
 821       bind(stub->continuation());
 822 #else
 823       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 824       // Call the slow-path instead.
 825       jcc(Assembler::notEqual, NO_COUNT);
 826 #endif
 827     }
 828   }
 829 
 830   // It's inflated.
 831 #if INCLUDE_RTM_OPT
 832   if (use_rtm) {
 833     Label L_regular_inflated_unlock;
 834     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 835     movptr(boxReg, Address(tmpReg, owner_offset));
 836     testptr(boxReg, boxReg);
 837     jccb(Assembler::notZero, L_regular_inflated_unlock);
 838     xend();
 839     jmp(DONE_LABEL);
 840     bind(L_regular_inflated_unlock);
 841   }
 842 #endif
 843 
 844   // Despite our balanced locking property we still check that m->_owner == Self
 845   // as java routines or native JNI code called by this thread might
 846   // have released the lock.
 847   // Refer to the comments in synchronizer.cpp for how we might encode extra
 848   // state in _succ so we can avoid fetching EntryList|cxq.
 849   //
 850   // If there's no contention try a 1-0 exit.  That is, exit without
 851   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 852   // we detect and recover from the race that the 1-0 exit admits.
 853   //
 854   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 855   // before it STs null into _owner, releasing the lock.  Updates
 856   // to data protected by the critical section must be visible before
 857   // we drop the lock (and thus before any other thread could acquire
 858   // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
 860   // each other and there's no need for an explicit barrier (fence).
 861   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 862 #ifndef _LP64
 863   // Note that we could employ various encoding schemes to reduce
 864   // the number of loads below (currently 4) to just 2 or 3.
 865   // Refer to the comments in synchronizer.cpp.
 866   // In practice the chain of fetches doesn't seem to impact performance, however.
 867   xorptr(boxReg, boxReg);
 868   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 869   jccb  (Assembler::notZero, DONE_LABEL);
 870   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 871   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 872   jccb  (Assembler::notZero, DONE_LABEL);
 873   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 874   jmpb  (DONE_LABEL);
 875 #else // _LP64
 876   // It's inflated
 877   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 878 
 879   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 880   jccb(Assembler::equal, LNotRecursive);
 881 
 882   // Recursive inflated unlock
 883   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 884   jmpb(LSuccess);
 885 
 886   bind(LNotRecursive);
 887   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 888   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 889   jccb  (Assembler::notZero, CheckSucc);
 890   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 891   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 892   jmpb  (DONE_LABEL);
 893 
 894   // Try to avoid passing control into the slow_path ...
 895   bind  (CheckSucc);
 896 
 897   // The following optional optimization can be elided if necessary
 898   // Effectively: if (succ == null) goto slow path
 899   // The code reduces the window for a race, however,
 900   // and thus benefits performance.
 901   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 902   jccb  (Assembler::zero, LGoSlowPath);
 903 
 904   xorptr(boxReg, boxReg);
 905   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 906   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 907 
 908   // Memory barrier/fence
 909   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 910   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 911   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 912   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 913   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 914   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 915   lock(); addl(Address(rsp, 0), 0);
 916 
 917   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 918   jccb  (Assembler::notZero, LSuccess);
 919 
 920   // Rare inopportune interleaving - race.
 921   // The successor vanished in the small window above.
 922   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 923   // We need to ensure progress and succession.
 924   // Try to reacquire the lock.
 925   // If that fails then the new owner is responsible for succession and this
 926   // thread needs to take no further action and can exit via the fast path (success).
 927   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 931 
 932   // box is really RAX -- the following CMPXCHG depends on that binding
 933   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 934   lock();
 935   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we try to regrab the lock.
  // If that doesn't work, then another thread grabbed the
 938   // lock so we're done (and exit was a success).
 939   jccb  (Assembler::notEqual, LSuccess);
 940   // Intentional fall-through into slow path
 941 
 942   bind  (LGoSlowPath);
 943   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 944   jmpb  (DONE_LABEL);
 945 
 946   bind  (LSuccess);
 947   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 948   jmpb  (DONE_LABEL);
 949 
 950 #endif
 951   if (!UseHeavyMonitors) {
 952     bind  (Stacked);
 953     if (UseFastLocking) {
 954       mov(boxReg, tmpReg);
 955       fast_unlock_impl(objReg, boxReg, tmpReg, NO_COUNT);
 956       jmp(COUNT);
 957     } else {
 958       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 959       lock();
 960       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 961     }
 962     // Intentional fall-thru into DONE_LABEL
 963   }
 964   bind(DONE_LABEL);
 965 
 966   // ZFlag == 1 count in fast path
 967   // ZFlag == 0 count in slow path
 968   jccb(Assembler::notZero, NO_COUNT);
 969 
 970   bind(COUNT);
 971   // Count monitors in fast path
 972 #ifndef _LP64
 973   get_thread(tmpReg);
 974   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 975 #else // _LP64
 976   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 977 #endif
 978 
 979   xorl(tmpReg, tmpReg); // Set ZF == 1
 980 
 981   bind(NO_COUNT);
 982 }
 983 
 984 //-------------------------------------------------------------------------------------------
 985 // Generic instructions support for use in .ad files C2 code generation
 986 
 987 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 988   if (dst != src) {
 989     movdqu(dst, src);
 990   }
 991   if (opcode == Op_AbsVD) {
 992     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 993   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 995     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 996   }
 997 }
 998 
 999 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1000   if (opcode == Op_AbsVD) {
1001     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1002   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1004     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1005   }
1006 }
1007 
1008 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1009   if (dst != src) {
1010     movdqu(dst, src);
1011   }
1012   if (opcode == Op_AbsVF) {
1013     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1014   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1016     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1017   }
1018 }
1019 
1020 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1021   if (opcode == Op_AbsVF) {
1022     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1023   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1025     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1026   }
1027 }
1028 
1029 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1030   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1031   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
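  // Note: SSE has no packed min/max for 64-bit lanes, so the T_LONG cases below
  // synthesize it with a signed compare (pcmpgtq) that leaves a mask in xmm0,
  // followed by blendvpd, which implicitly uses xmm0 as the blend mask.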
1032 
1033   if (opcode == Op_MinV) {
1034     if (elem_bt == T_BYTE) {
1035       pminsb(dst, src);
1036     } else if (elem_bt == T_SHORT) {
1037       pminsw(dst, src);
1038     } else if (elem_bt == T_INT) {
1039       pminsd(dst, src);
1040     } else {
1041       assert(elem_bt == T_LONG, "required");
1042       assert(tmp == xmm0, "required");
1043       assert_different_registers(dst, src, tmp);
1044       movdqu(xmm0, dst);
1045       pcmpgtq(xmm0, src);
1046       blendvpd(dst, src);  // xmm0 as mask
1047     }
1048   } else { // opcode == Op_MaxV
1049     if (elem_bt == T_BYTE) {
1050       pmaxsb(dst, src);
1051     } else if (elem_bt == T_SHORT) {
1052       pmaxsw(dst, src);
1053     } else if (elem_bt == T_INT) {
1054       pmaxsd(dst, src);
1055     } else {
1056       assert(elem_bt == T_LONG, "required");
1057       assert(tmp == xmm0, "required");
1058       assert_different_registers(dst, src, tmp);
1059       movdqu(xmm0, src);
1060       pcmpgtq(xmm0, dst);
1061       blendvpd(dst, src);  // xmm0 as mask
1062     }
1063   }
1064 }
1065 
1066 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1067                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1068                                  int vlen_enc) {
1069   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1070 
1071   if (opcode == Op_MinV) {
1072     if (elem_bt == T_BYTE) {
1073       vpminsb(dst, src1, src2, vlen_enc);
1074     } else if (elem_bt == T_SHORT) {
1075       vpminsw(dst, src1, src2, vlen_enc);
1076     } else if (elem_bt == T_INT) {
1077       vpminsd(dst, src1, src2, vlen_enc);
1078     } else {
1079       assert(elem_bt == T_LONG, "required");
1080       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1081         vpminsq(dst, src1, src2, vlen_enc);
1082       } else {
1083         assert_different_registers(dst, src1, src2);
1084         vpcmpgtq(dst, src1, src2, vlen_enc);
1085         vblendvpd(dst, src1, src2, dst, vlen_enc);
1086       }
1087     }
1088   } else { // opcode == Op_MaxV
1089     if (elem_bt == T_BYTE) {
1090       vpmaxsb(dst, src1, src2, vlen_enc);
1091     } else if (elem_bt == T_SHORT) {
1092       vpmaxsw(dst, src1, src2, vlen_enc);
1093     } else if (elem_bt == T_INT) {
1094       vpmaxsd(dst, src1, src2, vlen_enc);
1095     } else {
1096       assert(elem_bt == T_LONG, "required");
1097       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1098         vpmaxsq(dst, src1, src2, vlen_enc);
1099       } else {
1100         assert_different_registers(dst, src1, src2);
1101         vpcmpgtq(dst, src1, src2, vlen_enc);
1102         vblendvpd(dst, src2, src1, dst, vlen_enc);
1103       }
1104     }
1105   }
1106 }
1107 
1108 // Float/Double min max
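// The blend/compare sequences below are arranged so the result follows Java's
// Math.min/max semantics (NaN propagation, and -0.0 ordered below +0.0), which
// plain vminps/vmaxps-style instructions do not guarantee on their own.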
1109 
1110 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1111                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1112                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1113                                    int vlen_enc) {
1114   assert(UseAVX > 0, "required");
1115   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1116          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1117   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1118   assert_different_registers(a, b, tmp, atmp, btmp);
1119 
1120   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1121   bool is_double_word = is_double_word_type(elem_bt);
1122 
1123   if (!is_double_word && is_min) {
1124     vblendvps(atmp, a, b, a, vlen_enc);
1125     vblendvps(btmp, b, a, a, vlen_enc);
1126     vminps(tmp, atmp, btmp, vlen_enc);
1127     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1128     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1129   } else if (!is_double_word && !is_min) {
1130     vblendvps(btmp, b, a, b, vlen_enc);
1131     vblendvps(atmp, a, b, b, vlen_enc);
1132     vmaxps(tmp, atmp, btmp, vlen_enc);
1133     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1134     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1135   } else if (is_double_word && is_min) {
1136     vblendvpd(atmp, a, b, a, vlen_enc);
1137     vblendvpd(btmp, b, a, a, vlen_enc);
1138     vminpd(tmp, atmp, btmp, vlen_enc);
1139     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1140     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1141   } else {
1142     assert(is_double_word && !is_min, "sanity");
1143     vblendvpd(btmp, b, a, b, vlen_enc);
1144     vblendvpd(atmp, a, b, b, vlen_enc);
1145     vmaxpd(tmp, atmp, btmp, vlen_enc);
1146     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1147     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1148   }
1149 }
1150 
1151 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1152                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1153                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1154                                     int vlen_enc) {
1155   assert(UseAVX > 2, "required");
1156   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1157          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1158   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1159   assert_different_registers(dst, a, b, atmp, btmp);
1160 
1161   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1162   bool is_double_word = is_double_word_type(elem_bt);
1163   bool merge = true;
1164 
1165   if (!is_double_word && is_min) {
1166     evpmovd2m(ktmp, a, vlen_enc);
1167     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1168     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1169     vminps(dst, atmp, btmp, vlen_enc);
1170     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1171     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1172   } else if (!is_double_word && !is_min) {
1173     evpmovd2m(ktmp, b, vlen_enc);
1174     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1175     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1176     vmaxps(dst, atmp, btmp, vlen_enc);
1177     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1178     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1179   } else if (is_double_word && is_min) {
1180     evpmovq2m(ktmp, a, vlen_enc);
1181     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1182     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1183     vminpd(dst, atmp, btmp, vlen_enc);
1184     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1185     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1186   } else {
1187     assert(is_double_word && !is_min, "sanity");
1188     evpmovq2m(ktmp, b, vlen_enc);
1189     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1190     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1191     vmaxpd(dst, atmp, btmp, vlen_enc);
1192     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1193     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1194   }
1195 }
1196 
1197 // Float/Double signum
1198 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1199   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1200 
1201   Label DONE_LABEL;
1202 
1203   if (opcode == Op_SignumF) {
1204     assert(UseSSE > 0, "required");
1205     ucomiss(dst, zero);
1206     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1207     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1208     movflt(dst, one);
1209     jcc(Assembler::above, DONE_LABEL);
1210     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1211   } else if (opcode == Op_SignumD) {
1212     assert(UseSSE > 1, "required");
1213     ucomisd(dst, zero);
1214     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1215     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1216     movdbl(dst, one);
1217     jcc(Assembler::above, DONE_LABEL);
1218     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1219   }
1220 
1221   bind(DONE_LABEL);
1222 }
1223 
1224 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1225   if (sign) {
1226     pmovsxbw(dst, src);
1227   } else {
1228     pmovzxbw(dst, src);
1229   }
1230 }
1231 
1232 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1233   if (sign) {
1234     vpmovsxbw(dst, src, vector_len);
1235   } else {
1236     vpmovzxbw(dst, src, vector_len);
1237   }
1238 }
1239 
1240 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1241   if (sign) {
1242     vpmovsxbd(dst, src, vector_len);
1243   } else {
1244     vpmovzxbd(dst, src, vector_len);
1245   }
1246 }
1247 
1248 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1249   if (sign) {
1250     vpmovsxwd(dst, src, vector_len);
1251   } else {
1252     vpmovzxwd(dst, src, vector_len);
1253   }
1254 }
1255 
1256 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1257                                      int shift, int vector_len) {
1258   if (opcode == Op_RotateLeftV) {
1259     if (etype == T_INT) {
1260       evprold(dst, src, shift, vector_len);
1261     } else {
1262       assert(etype == T_LONG, "expected type T_LONG");
1263       evprolq(dst, src, shift, vector_len);
1264     }
1265   } else {
1266     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1267     if (etype == T_INT) {
1268       evprord(dst, src, shift, vector_len);
1269     } else {
1270       assert(etype == T_LONG, "expected type T_LONG");
1271       evprorq(dst, src, shift, vector_len);
1272     }
1273   }
1274 }
1275 
1276 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1277                                      XMMRegister shift, int vector_len) {
1278   if (opcode == Op_RotateLeftV) {
1279     if (etype == T_INT) {
1280       evprolvd(dst, src, shift, vector_len);
1281     } else {
1282       assert(etype == T_LONG, "expected type T_LONG");
1283       evprolvq(dst, src, shift, vector_len);
1284     }
1285   } else {
1286     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1287     if (etype == T_INT) {
1288       evprorvd(dst, src, shift, vector_len);
1289     } else {
1290       assert(etype == T_LONG, "expected type T_LONG");
1291       evprorvq(dst, src, shift, vector_len);
1292     }
1293   }
1294 }
1295 
1296 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1297   if (opcode == Op_RShiftVI) {
1298     psrad(dst, shift);
1299   } else if (opcode == Op_LShiftVI) {
1300     pslld(dst, shift);
1301   } else {
1302     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1303     psrld(dst, shift);
1304   }
1305 }
1306 
1307 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1308   switch (opcode) {
1309     case Op_RShiftVI:  psrad(dst, shift); break;
1310     case Op_LShiftVI:  pslld(dst, shift); break;
1311     case Op_URShiftVI: psrld(dst, shift); break;
1312 
1313     default: assert(false, "%s", NodeClassNames[opcode]);
1314   }
1315 }
1316 
1317 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1318   if (opcode == Op_RShiftVI) {
1319     vpsrad(dst, nds, shift, vector_len);
1320   } else if (opcode == Op_LShiftVI) {
1321     vpslld(dst, nds, shift, vector_len);
1322   } else {
1323     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1324     vpsrld(dst, nds, shift, vector_len);
1325   }
1326 }
1327 
1328 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1329   switch (opcode) {
1330     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1331     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1332     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1333 
1334     default: assert(false, "%s", NodeClassNames[opcode]);
1335   }
1336 }
1337 
1338 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1339   switch (opcode) {
1340     case Op_RShiftVB:  // fall-through
1341     case Op_RShiftVS:  psraw(dst, shift); break;
1342 
1343     case Op_LShiftVB:  // fall-through
1344     case Op_LShiftVS:  psllw(dst, shift);   break;
1345 
1346     case Op_URShiftVS: // fall-through
1347     case Op_URShiftVB: psrlw(dst, shift);  break;
1348 
1349     default: assert(false, "%s", NodeClassNames[opcode]);
1350   }
1351 }
1352 
1353 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1354   switch (opcode) {
1355     case Op_RShiftVB:  // fall-through
1356     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1357 
1358     case Op_LShiftVB:  // fall-through
1359     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1360 
1361     case Op_URShiftVS: // fall-through
1362     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1363 
1364     default: assert(false, "%s", NodeClassNames[opcode]);
1365   }
1366 }
1367 
1368 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1369   switch (opcode) {
1370     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1371     case Op_LShiftVL:  psllq(dst, shift); break;
1372     case Op_URShiftVL: psrlq(dst, shift); break;
1373 
1374     default: assert(false, "%s", NodeClassNames[opcode]);
1375   }
1376 }
1377 
1378 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1379   if (opcode == Op_RShiftVL) {
1380     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1381   } else if (opcode == Op_LShiftVL) {
1382     psllq(dst, shift);
1383   } else {
1384     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1385     psrlq(dst, shift);
1386   }
1387 }
1388 
1389 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1390   switch (opcode) {
1391     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1392     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1393     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1394 
1395     default: assert(false, "%s", NodeClassNames[opcode]);
1396   }
1397 }
1398 
1399 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1400   if (opcode == Op_RShiftVL) {
1401     evpsraq(dst, nds, shift, vector_len);
1402   } else if (opcode == Op_LShiftVL) {
1403     vpsllq(dst, nds, shift, vector_len);
1404   } else {
1405     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1406     vpsrlq(dst, nds, shift, vector_len);
1407   }
1408 }
1409 
1410 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1411   switch (opcode) {
1412     case Op_RShiftVB:  // fall-through
1413     case Op_RShiftVS:  // fall-through
1414     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1415 
1416     case Op_LShiftVB:  // fall-through
1417     case Op_LShiftVS:  // fall-through
1418     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1419 
1420     case Op_URShiftVB: // fall-through
1421     case Op_URShiftVS: // fall-through
1422     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1423 
1424     default: assert(false, "%s", NodeClassNames[opcode]);
1425   }
1426 }
1427 
1428 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1429   switch (opcode) {
1430     case Op_RShiftVB:  // fall-through
1431     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1432 
1433     case Op_LShiftVB:  // fall-through
1434     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1435 
1436     case Op_URShiftVB: // fall-through
1437     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1438 
1439     default: assert(false, "%s", NodeClassNames[opcode]);
1440   }
1441 }
1442 
1443 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1444   assert(UseAVX >= 2, "required");
1445   switch (opcode) {
1446     case Op_RShiftVL: {
1447       if (UseAVX > 2) {
1448         assert(tmp == xnoreg, "not used");
1449         if (!VM_Version::supports_avx512vl()) {
1450           vlen_enc = Assembler::AVX_512bit;
1451         }
1452         evpsravq(dst, src, shift, vlen_enc);
1453       } else {
1454         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1455         vpsrlvq(dst, src, shift, vlen_enc);
1456         vpsrlvq(tmp, tmp, shift, vlen_enc);
1457         vpxor(dst, dst, tmp, vlen_enc);
1458         vpsubq(dst, dst, tmp, vlen_enc);
1459       }
1460       break;
1461     }
1462     case Op_LShiftVL: {
1463       assert(tmp == xnoreg, "not used");
1464       vpsllvq(dst, src, shift, vlen_enc);
1465       break;
1466     }
1467     case Op_URShiftVL: {
1468       assert(tmp == xnoreg, "not used");
1469       vpsrlvq(dst, src, shift, vlen_enc);
1470       break;
1471     }
1472     default: assert(false, "%s", NodeClassNames[opcode]);
1473   }
1474 }
1475 
1476 // Variable shift src by shift using vtmp as a TEMP, giving the word result in dst
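     // Bytes are widened to dwords, shifted, masked back to the byte range, and re-packed
     // into words with vpackusdw.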
1477 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1478   assert(opcode == Op_LShiftVB ||
1479          opcode == Op_RShiftVB ||
1480          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1481   bool sign = (opcode != Op_URShiftVB);
1482   assert(vector_len == 0, "required");
1483   vextendbd(sign, dst, src, 1);
1484   vpmovzxbd(vtmp, shift, 1);
1485   varshiftd(opcode, dst, dst, vtmp, 1);
1486   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1487   vextracti128_high(vtmp, dst);
1488   vpackusdw(dst, dst, vtmp, 0);
1489 }
1490 
1491 // Variable shift src by shift using vtmp as a TEMP, giving the byte result in dst
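     // Bytes are widened to words, shifted, masked back to the byte range, and re-packed
     // into bytes; for the wider vectors a vpermq restores the lane order after the
     // in-lane pack.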
1492 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1493   assert(opcode == Op_LShiftVB ||
1494          opcode == Op_RShiftVB ||
1495          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1496   bool sign = (opcode != Op_URShiftVB);
1497   int ext_vector_len = vector_len + 1;
1498   vextendbw(sign, dst, src, ext_vector_len);
1499   vpmovzxbw(vtmp, shift, ext_vector_len);
1500   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1501   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1502   if (vector_len == 0) {
1503     vextracti128_high(vtmp, dst);
1504     vpackuswb(dst, dst, vtmp, vector_len);
1505   } else {
1506     vextracti64x4_high(vtmp, dst);
1507     vpackuswb(dst, dst, vtmp, vector_len);
1508     vpermq(dst, dst, 0xD8, vector_len);
1509   }
1510 }
1511 
1512 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1513   switch(typ) {
1514     case T_BYTE:
1515       pinsrb(dst, val, idx);
1516       break;
1517     case T_SHORT:
1518       pinsrw(dst, val, idx);
1519       break;
1520     case T_INT:
1521       pinsrd(dst, val, idx);
1522       break;
1523     case T_LONG:
1524       pinsrq(dst, val, idx);
1525       break;
1526     default:
1527       assert(false,"Should not reach here.");
1528       break;
1529   }
1530 }
1531 
1532 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1533   switch(typ) {
1534     case T_BYTE:
1535       vpinsrb(dst, src, val, idx);
1536       break;
1537     case T_SHORT:
1538       vpinsrw(dst, src, val, idx);
1539       break;
1540     case T_INT:
1541       vpinsrd(dst, src, val, idx);
1542       break;
1543     case T_LONG:
1544       vpinsrq(dst, src, val, idx);
1545       break;
1546     default:
1547       assert(false,"Should not reach here.");
1548       break;
1549   }
1550 }
1551 
1552 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1553   switch(typ) {
1554     case T_INT:
1555       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1556       break;
1557     case T_FLOAT:
1558       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1559       break;
1560     case T_LONG:
1561       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1562       break;
1563     case T_DOUBLE:
1564       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1565       break;
1566     default:
1567       assert(false,"Should not reach here.");
1568       break;
1569   }
1570 }
1571 
1572 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1573   switch(typ) {
1574     case T_INT:
1575       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1576       break;
1577     case T_FLOAT:
1578       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1579       break;
1580     case T_LONG:
1581       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1582       break;
1583     case T_DOUBLE:
1584       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1585       break;
1586     default:
1587       assert(false,"Should not reach here.");
1588       break;
1589   }
1590 }
1591 
1592 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1593   switch(typ) {
1594     case T_INT:
1595       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1596       break;
1597     case T_FLOAT:
1598       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1599       break;
1600     case T_LONG:
1601       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1602       break;
1603     case T_DOUBLE:
1604       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1605       break;
1606     default:
1607       assert(false,"Should not reach here.");
1608       break;
1609   }
1610 }
1611 
1612 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
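       // Convert a vector of booleans (one 0/1 byte per element) into a vector mask:
       // 0 - src yields 0x00/0xFF per byte, which is then sign-extended to the element
       // width of elem_bt.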
1613   if (vlen_in_bytes <= 16) {
1614     pxor (dst, dst);
1615     psubb(dst, src);
1616     switch (elem_bt) {
1617       case T_BYTE:   /* nothing to do */ break;
1618       case T_SHORT:  pmovsxbw(dst, dst); break;
1619       case T_INT:    pmovsxbd(dst, dst); break;
1620       case T_FLOAT:  pmovsxbd(dst, dst); break;
1621       case T_LONG:   pmovsxbq(dst, dst); break;
1622       case T_DOUBLE: pmovsxbq(dst, dst); break;
1623 
1624       default: assert(false, "%s", type2name(elem_bt));
1625     }
1626   } else {
1627     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1628     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1629 
1630     vpxor (dst, dst, dst, vlen_enc);
1631     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1632 
1633     switch (elem_bt) {
1634       case T_BYTE:   /* nothing to do */            break;
1635       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1636       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1637       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1638       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1639       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1640 
1641       default: assert(false, "%s", type2name(elem_bt));
1642     }
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
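       // Produce an opmask from a vector of booleans. When the BW/VL forms are not
       // available (novlbwdq), the bytes are sign-extended to dwords and compared for
       // equality against vector_int_mask_cmp_bits() to set the mask bits; otherwise
       // 0 - src turns the booleans into 0x00/0xFF bytes and evpmovb2m moves their
       // sign bits into the opmask.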
1647   if (novlbwdq) {
1648     vpmovsxbd(xtmp, src, vlen_enc);
1649     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1650             Assembler::eq, true, vlen_enc, noreg);
1651   } else {
1652     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1653     vpsubb(xtmp, xtmp, src, vlen_enc);
1654     evpmovb2m(dst, xtmp, vlen_enc);
1655   }
1656 }
1657 
1658 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1659   switch (vlen_in_bytes) {
1660     case 4:  movdl(dst, src);   break;
1661     case 8:  movq(dst, src);    break;
1662     case 16: movdqu(dst, src);  break;
1663     case 32: vmovdqu(dst, src); break;
1664     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1665     default: ShouldNotReachHere();
1666   }
1667 }
1668 
1669 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1670   assert(rscratch != noreg || always_reachable(src), "missing");
1671 
1672   if (reachable(src)) {
1673     load_vector(dst, as_Address(src), vlen_in_bytes);
1674   } else {
1675     lea(rscratch, src);
1676     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1677   }
1678 }
1679 
1680 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1681   int vlen_enc = vector_length_encoding(vlen);
1682   if (VM_Version::supports_avx()) {
1683     if (bt == T_LONG) {
1684       if (VM_Version::supports_avx2()) {
1685         vpbroadcastq(dst, src, vlen_enc);
1686       } else {
1687         vmovddup(dst, src, vlen_enc);
1688       }
1689     } else if (bt == T_DOUBLE) {
1690       if (vlen_enc != Assembler::AVX_128bit) {
1691         vbroadcastsd(dst, src, vlen_enc, noreg);
1692       } else {
1693         vmovddup(dst, src, vlen_enc);
1694       }
1695     } else {
1696       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1697         vpbroadcastd(dst, src, vlen_enc);
1698       } else {
1699         vbroadcastss(dst, src, vlen_enc);
1700       }
1701     }
1702   } else if (VM_Version::supports_sse3()) {
1703     movddup(dst, src);
1704   } else {
1705     movq(dst, src);
1706     if (vlen == 16) {
1707       punpcklqdq(dst, dst);
1708     }
1709   }
1710 }
1711 
1712 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1713   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1714   int offset = exact_log2(type2aelembytes(bt)) << 6;
1715   if (is_floating_point_type(bt)) {
1716     offset += 128;
1717   }
1718   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1719   load_vector(dst, addr, vlen_in_bytes);
1720 }
1721 
1722 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
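     // The reductions below repeatedly fold the upper half of the vector into the lower
     // half (via extracts, shuffles or horizontal adds) until a single lane remains. The
     // integer variants then combine that lane with the incoming scalar in src1; the FP
     // variants accumulate directly into dst.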
1723 
1724 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1725   int vector_len = Assembler::AVX_128bit;
1726 
1727   switch (opcode) {
1728     case Op_AndReductionV:  pand(dst, src); break;
1729     case Op_OrReductionV:   por (dst, src); break;
1730     case Op_XorReductionV:  pxor(dst, src); break;
1731     case Op_MinReductionV:
1732       switch (typ) {
1733         case T_BYTE:        pminsb(dst, src); break;
1734         case T_SHORT:       pminsw(dst, src); break;
1735         case T_INT:         pminsd(dst, src); break;
1736         case T_LONG:        assert(UseAVX > 2, "required");
1737                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1738         default:            assert(false, "wrong type");
1739       }
1740       break;
1741     case Op_MaxReductionV:
1742       switch (typ) {
1743         case T_BYTE:        pmaxsb(dst, src); break;
1744         case T_SHORT:       pmaxsw(dst, src); break;
1745         case T_INT:         pmaxsd(dst, src); break;
1746         case T_LONG:        assert(UseAVX > 2, "required");
1747                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1748         default:            assert(false, "wrong type");
1749       }
1750       break;
1751     case Op_AddReductionVF: addss(dst, src); break;
1752     case Op_AddReductionVD: addsd(dst, src); break;
1753     case Op_AddReductionVI:
1754       switch (typ) {
1755         case T_BYTE:        paddb(dst, src); break;
1756         case T_SHORT:       paddw(dst, src); break;
1757         case T_INT:         paddd(dst, src); break;
1758         default:            assert(false, "wrong type");
1759       }
1760       break;
1761     case Op_AddReductionVL: paddq(dst, src); break;
1762     case Op_MulReductionVF: mulss(dst, src); break;
1763     case Op_MulReductionVD: mulsd(dst, src); break;
1764     case Op_MulReductionVI:
1765       switch (typ) {
1766         case T_SHORT:       pmullw(dst, src); break;
1767         case T_INT:         pmulld(dst, src); break;
1768         default:            assert(false, "wrong type");
1769       }
1770       break;
1771     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1772                             evpmullq(dst, dst, src, vector_len); break;
1773     default:                assert(false, "wrong opcode");
1774   }
1775 }
1776 
1777 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1778   int vector_len = Assembler::AVX_256bit;
1779 
1780   switch (opcode) {
1781     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1782     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1783     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1784     case Op_MinReductionV:
1785       switch (typ) {
1786         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1787         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1788         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1789         case T_LONG:        assert(UseAVX > 2, "required");
1790                             vpminsq(dst, src1, src2, vector_len); break;
1791         default:            assert(false, "wrong type");
1792       }
1793       break;
1794     case Op_MaxReductionV:
1795       switch (typ) {
1796         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1797         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1798         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1799         case T_LONG:        assert(UseAVX > 2, "required");
1800                             vpmaxsq(dst, src1, src2, vector_len); break;
1801         default:            assert(false, "wrong type");
1802       }
1803       break;
1804     case Op_AddReductionVI:
1805       switch (typ) {
1806         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1807         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1808         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1809         default:            assert(false, "wrong type");
1810       }
1811       break;
1812     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1813     case Op_MulReductionVI:
1814       switch (typ) {
1815         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1816         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1817         default:            assert(false, "wrong type");
1818       }
1819       break;
1820     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1821     default:                assert(false, "wrong opcode");
1822   }
1823 }
1824 
1825 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1826                                   XMMRegister dst, XMMRegister src,
1827                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1828   switch (opcode) {
1829     case Op_AddReductionVF:
1830     case Op_MulReductionVF:
1831       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1832       break;
1833 
1834     case Op_AddReductionVD:
1835     case Op_MulReductionVD:
1836       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1837       break;
1838 
1839     default: assert(false, "wrong opcode");
1840   }
1841 }
1842 
1843 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1844                              Register dst, Register src1, XMMRegister src2,
1845                              XMMRegister vtmp1, XMMRegister vtmp2) {
1846   switch (vlen) {
1847     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1848     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1849     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1850     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851 
1852     default: assert(false, "wrong vector length");
1853   }
1854 }
1855 
1856 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1857                              Register dst, Register src1, XMMRegister src2,
1858                              XMMRegister vtmp1, XMMRegister vtmp2) {
1859   switch (vlen) {
1860     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1861     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1863     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1864 
1865     default: assert(false, "wrong vector length");
1866   }
1867 }
1868 
1869 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1870                              Register dst, Register src1, XMMRegister src2,
1871                              XMMRegister vtmp1, XMMRegister vtmp2) {
1872   switch (vlen) {
1873     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1874     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877 
1878     default: assert(false, "wrong vector length");
1879   }
1880 }
1881 
1882 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1883                              Register dst, Register src1, XMMRegister src2,
1884                              XMMRegister vtmp1, XMMRegister vtmp2) {
1885   switch (vlen) {
1886     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1887     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890 
1891     default: assert(false, "wrong vector length");
1892   }
1893 }
1894 
1895 #ifdef _LP64
1896 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1897                              Register dst, Register src1, XMMRegister src2,
1898                              XMMRegister vtmp1, XMMRegister vtmp2) {
1899   switch (vlen) {
1900     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1902     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1903 
1904     default: assert(false, "wrong vector length");
1905   }
1906 }
1907 #endif // _LP64
1908 
1909 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1910   switch (vlen) {
1911     case 2:
1912       assert(vtmp2 == xnoreg, "");
1913       reduce2F(opcode, dst, src, vtmp1);
1914       break;
1915     case 4:
1916       assert(vtmp2 == xnoreg, "");
1917       reduce4F(opcode, dst, src, vtmp1);
1918       break;
1919     case 8:
1920       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1921       break;
1922     case 16:
1923       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1924       break;
1925     default: assert(false, "wrong vector length");
1926   }
1927 }
1928 
1929 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1930   switch (vlen) {
1931     case 2:
1932       assert(vtmp2 == xnoreg, "");
1933       reduce2D(opcode, dst, src, vtmp1);
1934       break;
1935     case 4:
1936       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1937       break;
1938     case 8:
1939       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1940       break;
1941     default: assert(false, "wrong vector length");
1942   }
1943 }
1944 
1945 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1946   if (opcode == Op_AddReductionVI) {
1947     if (vtmp1 != src2) {
1948       movdqu(vtmp1, src2);
1949     }
1950     phaddd(vtmp1, vtmp1);
1951   } else {
1952     pshufd(vtmp1, src2, 0x1);
1953     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1954   }
1955   movdl(vtmp2, src1);
1956   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1957   movdl(dst, vtmp1);
1958 }
1959 
1960 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1961   if (opcode == Op_AddReductionVI) {
1962     if (vtmp1 != src2) {
1963       movdqu(vtmp1, src2);
1964     }
1965     phaddd(vtmp1, src2);
1966     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1967   } else {
1968     pshufd(vtmp2, src2, 0xE);
1969     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1970     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1971   }
1972 }
1973 
1974 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1975   if (opcode == Op_AddReductionVI) {
1976     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1977     vextracti128_high(vtmp2, vtmp1);
1978     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1979     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1980   } else {
1981     vextracti128_high(vtmp1, src2);
1982     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1983     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1984   }
1985 }
1986 
1987 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1988   vextracti64x4_high(vtmp2, src2);
1989   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1990   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1991 }
1992 
1993 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1994   pshufd(vtmp2, src2, 0x1);
1995   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1996   movdqu(vtmp1, vtmp2);
1997   psrldq(vtmp1, 2);
1998   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1999   movdqu(vtmp2, vtmp1);
2000   psrldq(vtmp2, 1);
2001   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2002   movdl(vtmp2, src1);
2003   pmovsxbd(vtmp1, vtmp1);
2004   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2005   pextrb(dst, vtmp1, 0x0);
2006   movsbl(dst, dst);
2007 }
2008 
2009 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   pshufd(vtmp1, src2, 0xE);
2011   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2012   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2013 }
2014 
2015 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2016   vextracti128_high(vtmp2, src2);
2017   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2018   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2019 }
2020 
2021 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2022   vextracti64x4_high(vtmp1, src2);
2023   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2024   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2025 }
2026 
2027 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2028   pmovsxbw(vtmp2, src2);
2029   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2030 }
2031 
2032 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2033   if (UseAVX > 1) {
2034     int vector_len = Assembler::AVX_256bit;
2035     vpmovsxbw(vtmp1, src2, vector_len);
2036     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2037   } else {
2038     pmovsxbw(vtmp2, src2);
2039     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2040     pshufd(vtmp2, src2, 0x1);
2041     pmovsxbw(vtmp2, src2);
2042     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2043   }
2044 }
2045 
2046 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2047   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2048     int vector_len = Assembler::AVX_512bit;
2049     vpmovsxbw(vtmp1, src2, vector_len);
2050     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2051   } else {
2052     assert(UseAVX >= 2,"Should not reach here.");
2053     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2054     vextracti128_high(vtmp2, src2);
2055     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2056   }
2057 }
2058 
2059 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2060   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2061   vextracti64x4_high(vtmp2, src2);
2062   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2063 }
2064 
2065 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   if (opcode == Op_AddReductionVI) {
2067     if (vtmp1 != src2) {
2068       movdqu(vtmp1, src2);
2069     }
2070     phaddw(vtmp1, vtmp1);
2071     phaddw(vtmp1, vtmp1);
2072   } else {
2073     pshufd(vtmp2, src2, 0x1);
2074     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2075     movdqu(vtmp1, vtmp2);
2076     psrldq(vtmp1, 2);
2077     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2078   }
2079   movdl(vtmp2, src1);
2080   pmovsxwd(vtmp1, vtmp1);
2081   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2082   pextrw(dst, vtmp1, 0x0);
2083   movswl(dst, dst);
2084 }
2085 
2086 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2087   if (opcode == Op_AddReductionVI) {
2088     if (vtmp1 != src2) {
2089       movdqu(vtmp1, src2);
2090     }
2091     phaddw(vtmp1, src2);
2092   } else {
2093     pshufd(vtmp1, src2, 0xE);
2094     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2095   }
2096   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2097 }
2098 
2099 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2100   if (opcode == Op_AddReductionVI) {
2101     int vector_len = Assembler::AVX_256bit;
2102     vphaddw(vtmp2, src2, src2, vector_len);
2103     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2104   } else {
2105     vextracti128_high(vtmp2, src2);
2106     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2107   }
2108   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2109 }
2110 
2111 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2112   int vector_len = Assembler::AVX_256bit;
2113   vextracti64x4_high(vtmp1, src2);
2114   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2115   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2116 }
2117 
2118 #ifdef _LP64
2119 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2120   pshufd(vtmp2, src2, 0xE);
2121   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2122   movdq(vtmp1, src1);
2123   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2124   movdq(dst, vtmp1);
2125 }
2126 
2127 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2128   vextracti128_high(vtmp1, src2);
2129   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2130   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2131 }
2132 
2133 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2134   vextracti64x4_high(vtmp2, src2);
2135   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2136   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2137 }
2138 
2139 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
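       // Build an opmask with the low 'len' bits set: start from all ones, clear the bits
       // at positions >= len with bzhi, and move the result into the k-register.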
2140   mov64(temp, -1L);
2141   bzhiq(temp, temp, len);
2142   kmovql(dst, temp);
2143 }
2144 #endif // _LP64
2145 
2146 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2147   reduce_operation_128(T_FLOAT, opcode, dst, src);
2148   pshufd(vtmp, src, 0x1);
2149   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2150 }
2151 
2152 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2153   reduce2F(opcode, dst, src, vtmp);
2154   pshufd(vtmp, src, 0x2);
2155   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2156   pshufd(vtmp, src, 0x3);
2157   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2158 }
2159 
2160 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2161   reduce4F(opcode, dst, src, vtmp2);
2162   vextractf128_high(vtmp2, src);
2163   reduce4F(opcode, dst, vtmp2, vtmp1);
2164 }
2165 
2166 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2167   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2168   vextracti64x4_high(vtmp1, src);
2169   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2170 }
2171 
2172 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2173   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2174   pshufd(vtmp, src, 0xE);
2175   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2176 }
2177 
2178 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2179   reduce2D(opcode, dst, src, vtmp2);
2180   vextractf128_high(vtmp2, src);
2181   reduce2D(opcode, dst, vtmp2, vtmp1);
2182 }
2183 
2184 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2185   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2186   vextracti64x4_high(vtmp1, src);
2187   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2188 }
2189 
2190 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2191   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2192 }
2193 
2194 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2195   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2196 }
2197 
2198 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2199                                  int vec_enc) {
2200   switch(elem_bt) {
2201     case T_INT:
2202     case T_FLOAT:
2203       vmaskmovps(dst, src, mask, vec_enc);
2204       break;
2205     case T_LONG:
2206     case T_DOUBLE:
2207       vmaskmovpd(dst, src, mask, vec_enc);
2208       break;
2209     default:
2210       fatal("Unsupported type %s", type2name(elem_bt));
2211       break;
2212   }
2213 }
2214 
2215 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2216                                  int vec_enc) {
2217   switch(elem_bt) {
2218     case T_INT:
2219     case T_FLOAT:
2220       vmaskmovps(dst, src, mask, vec_enc);
2221       break;
2222     case T_LONG:
2223     case T_DOUBLE:
2224       vmaskmovpd(dst, src, mask, vec_enc);
2225       break;
2226     default:
2227       fatal("Unsupported type %s", type2name(elem_bt));
2228       break;
2229   }
2230 }
2231 
2232 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2233                                           XMMRegister dst, XMMRegister src,
2234                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2235                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2236   int permconst[] = {1, 14};
2237   XMMRegister wsrc = src;
2238   XMMRegister wdst = xmm_0;
2239   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2240 
2241   int vlen_enc = Assembler::AVX_128bit;
2242   if (vlen == 16) {
2243     vlen_enc = Assembler::AVX_256bit;
2244   }
2245 
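       // Each iteration halves the number of live elements: extract the upper 256-/128-bit
       // half, or permute within a 128-bit lane for the last two steps, and combine the two
       // halves with vminmax_fp. When is_dst_valid, the running value in dst is folded in
       // at the end.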
2246   for (int i = log2(vlen) - 1; i >=0; i--) {
2247     if (i == 0 && !is_dst_valid) {
2248       wdst = dst;
2249     }
2250     if (i == 3) {
2251       vextracti64x4_high(wtmp, wsrc);
2252     } else if (i == 2) {
2253       vextracti128_high(wtmp, wsrc);
2254     } else { // i = [0,1]
2255       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2256     }
2257     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2258     wsrc = wdst;
2259     vlen_enc = Assembler::AVX_128bit;
2260   }
2261   if (is_dst_valid) {
2262     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2263   }
2264 }
2265 
2266 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2267                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2268                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2269   XMMRegister wsrc = src;
2270   XMMRegister wdst = xmm_0;
2271   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2272   int vlen_enc = Assembler::AVX_128bit;
2273   if (vlen == 8) {
2274     vlen_enc = Assembler::AVX_256bit;
2275   }
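       // Same halving strategy as reduceFloatMinMax, but per 64-bit element: extract the
       // upper 256-/128-bit half, or swap the two doubles within a 128-bit lane for the
       // final step; dst is folded in at the end when is_dst_valid.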
2276   for (int i = log2(vlen) - 1; i >=0; i--) {
2277     if (i == 0 && !is_dst_valid) {
2278       wdst = dst;
2279     }
2280     if (i == 1) {
2281       vextracti128_high(wtmp, wsrc);
2282     } else if (i == 2) {
2283       vextracti64x4_high(wtmp, wsrc);
2284     } else {
2285       assert(i == 0, "%d", i);
2286       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2287     }
2288     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2289     wsrc = wdst;
2290     vlen_enc = Assembler::AVX_128bit;
2291   }
2292   if (is_dst_valid) {
2293     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2294   }
2295 }
2296 
2297 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2298   switch (bt) {
2299     case T_BYTE:  pextrb(dst, src, idx); break;
2300     case T_SHORT: pextrw(dst, src, idx); break;
2301     case T_INT:   pextrd(dst, src, idx); break;
2302     case T_LONG:  pextrq(dst, src, idx); break;
2303 
2304     default:
2305       assert(false,"Should not reach here.");
2306       break;
2307   }
2308 }
2309 
2310 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2311   int esize =  type2aelembytes(typ);
2312   int elem_per_lane = 16/esize;
2313   int lane = elemindex / elem_per_lane;
2314   int eindex = elemindex % elem_per_lane;
2315 
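       // Extract the 128-bit lane that holds the requested element; lane 0 needs no copy,
       // so src itself is returned in that case.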
2316   if (lane >= 2) {
2317     assert(UseAVX > 2, "required");
2318     vextractf32x4(dst, src, lane & 3);
2319     return dst;
2320   } else if (lane > 0) {
2321     assert(UseAVX > 0, "required");
2322     vextractf128(dst, src, lane);
2323     return dst;
2324   } else {
2325     return src;
2326   }
2327 }
2328 
2329 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2330   int esize =  type2aelembytes(typ);
2331   int elem_per_lane = 16/esize;
2332   int eindex = elemindex % elem_per_lane;
2333   assert(is_integral_type(typ),"required");
2334 
2335   if (eindex == 0) {
2336     if (typ == T_LONG) {
2337       movq(dst, src);
2338     } else {
2339       movdl(dst, src);
2340       if (typ == T_BYTE)
2341         movsbl(dst, dst);
2342       else if (typ == T_SHORT)
2343         movswl(dst, dst);
2344     }
2345   } else {
2346     extract(typ, dst, src, eindex);
2347   }
2348 }
2349 
2350 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2351   int esize =  type2aelembytes(typ);
2352   int elem_per_lane = 16/esize;
2353   int eindex = elemindex % elem_per_lane;
2354   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2355 
2356   if (eindex == 0) {
2357     movq(dst, src);
2358   } else {
2359     if (typ == T_FLOAT) {
2360       if (UseAVX == 0) {
2361         movdqu(dst, src);
2362         shufps(dst, dst, eindex);
2363       } else {
2364         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2365       }
2366     } else {
2367       if (UseAVX == 0) {
2368         movdqu(dst, src);
2369         psrldq(dst, eindex*esize);
2370       } else {
2371         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2372       }
2373       movq(dst, dst);
2374     }
2375   }
2376   // Zero upper bits
2377   if (typ == T_FLOAT) {
2378     if (UseAVX == 0) {
2379       assert(vtmp != xnoreg, "required.");
2380       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2381       pand(dst, vtmp);
2382     } else {
2383       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2384     }
2385   }
2386 }
2387 
2388 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2389   switch(typ) {
2390     case T_BYTE:
2391     case T_BOOLEAN:
2392       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2393       break;
2394     case T_SHORT:
2395     case T_CHAR:
2396       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2397       break;
2398     case T_INT:
2399     case T_FLOAT:
2400       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2401       break;
2402     case T_LONG:
2403     case T_DOUBLE:
2404       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2405       break;
2406     default:
2407       assert(false,"Should not reach here.");
2408       break;
2409   }
2410 }
2411 
2412 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2413   assert(rscratch != noreg || always_reachable(src2), "missing");
2414 
2415   switch(typ) {
2416     case T_BOOLEAN:
2417     case T_BYTE:
2418       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2419       break;
2420     case T_CHAR:
2421     case T_SHORT:
2422       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2423       break;
2424     case T_INT:
2425     case T_FLOAT:
2426       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2427       break;
2428     case T_LONG:
2429     case T_DOUBLE:
2430       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2431       break;
2432     default:
2433       assert(false,"Should not reach here.");
2434       break;
2435   }
2436 }
2437 
2438 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2439   switch(typ) {
2440     case T_BYTE:
2441       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2442       break;
2443     case T_SHORT:
2444       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2445       break;
2446     case T_INT:
2447     case T_FLOAT:
2448       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2449       break;
2450     case T_LONG:
2451     case T_DOUBLE:
2452       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2453       break;
2454     default:
2455       assert(false,"Should not reach here.");
2456       break;
2457   }
2458 }
2459 
2460 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2461   assert(vlen_in_bytes <= 32, "");
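       // Mask elements are expected to be all-ones or all-zero, so for 4-byte and wider
       // elements it is enough to test the 32-bit sign bits with vtestps; byte/short masks
       // fall back to (v)ptest, which tests every bit.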
2462   int esize = type2aelembytes(bt);
2463   if (vlen_in_bytes == 32) {
2464     assert(vtmp == xnoreg, "required.");
2465     if (esize >= 4) {
2466       vtestps(src1, src2, AVX_256bit);
2467     } else {
2468       vptest(src1, src2, AVX_256bit);
2469     }
2470     return;
2471   }
2472   if (vlen_in_bytes < 16) {
2473     // Duplicate the lower part to fill the whole register;
2474     // there is no need to do so for src2.
2475     assert(vtmp != xnoreg, "required");
2476     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2477     pshufd(vtmp, src1, shuffle_imm);
2478   } else {
2479     assert(vtmp == xnoreg, "required");
2480     vtmp = src1;
2481   }
2482   if (esize >= 4 && VM_Version::supports_avx()) {
2483     vtestps(vtmp, src2, AVX_128bit);
2484   } else {
2485     ptest(vtmp, src2);
2486   }
2487 }
2488 
2489 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2490   assert(UseAVX >= 2, "required");
2491 #ifdef ASSERT
2492   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2493   bool is_bw_supported = VM_Version::supports_avx512bw();
2494   if (is_bw && !is_bw_supported) {
2495     assert(vlen_enc != Assembler::AVX_512bit, "required");
2496     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2497            "XMM register should be 0-15");
2498   }
2499 #endif // ASSERT
2500   switch (elem_bt) {
2501     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2502     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2503     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2504     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2505     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2506     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2507     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2508   }
2509 }
2510 
2511 #ifdef _LP64
2512 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2513   assert(UseAVX >= 2, "required");
2514   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2515   bool is_vl = vlen_enc != Assembler::AVX_512bit;
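       // Use the EVEX GPR-source broadcasts when AVX-512 (with BW/VL as required) is
       // available; otherwise move the GPR into an XMM register first and use the
       // AVX/AVX2 broadcasts, which can only encode registers xmm0-xmm15.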
2516   if ((UseAVX > 2) &&
2517       (!is_bw || VM_Version::supports_avx512bw()) &&
2518       (!is_vl || VM_Version::supports_avx512vl())) {
2519     switch (elem_bt) {
2520       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2521       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2522       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2523       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2524       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2525     }
2526   } else {
2527     assert(vlen_enc != Assembler::AVX_512bit, "required");
2528     assert((dst->encoding() < 16),"XMM register should be 0-15");
2529     switch (elem_bt) {
2530       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2531       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2532       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2533       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2534       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2535       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2536       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2537     }
2538   }
2539 }
2540 #endif
2541 
2542 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2543   switch (to_elem_bt) {
2544     case T_SHORT:
2545       vpmovsxbw(dst, src, vlen_enc);
2546       break;
2547     case T_INT:
2548       vpmovsxbd(dst, src, vlen_enc);
2549       break;
2550     case T_FLOAT:
2551       vpmovsxbd(dst, src, vlen_enc);
2552       vcvtdq2ps(dst, dst, vlen_enc);
2553       break;
2554     case T_LONG:
2555       vpmovsxbq(dst, src, vlen_enc);
2556       break;
2557     case T_DOUBLE: {
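           // vcvtdq2pd doubles the element width, so the bytes are first sign-extended to
           // dwords in a vector of half the destination size.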
2558       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2559       vpmovsxbd(dst, src, mid_vlen_enc);
2560       vcvtdq2pd(dst, dst, vlen_enc);
2561       break;
2562     }
2563     default:
2564       fatal("Unsupported type %s", type2name(to_elem_bt));
2565       break;
2566   }
2567 }
2568 
2569 //-------------------------------------------------------------------------------------------
2570 
2571 // IndexOf for constant substrings with size >= 8 chars
2572 // which don't need to be loaded through the stack.
2573 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2574                                          Register cnt1, Register cnt2,
2575                                          int int_cnt2,  Register result,
2576                                          XMMRegister vec, Register tmp,
2577                                          int ae) {
2578   ShortBranchVerifier sbv(this);
2579   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2580   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2581 
2582   // This method uses the pcmpestri instruction with bound registers
2583   //   inputs:
2584   //     xmm - substring
2585   //     rax - substring length (elements count)
2586   //     mem - scanned string
2587   //     rdx - string length (elements count)
2588   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2589   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2590   //   outputs:
2591   //     rcx - matched index in string
2592   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2593   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2594   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2595   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2596   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2597 
2598   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2599         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2600         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2601 
2602   // Note, inline_string_indexOf() generates checks:
2603   // if (substr.count > string.count) return -1;
2604   // if (substr.count == 0) return 0;
2605   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2606 
2607   // Load substring.
2608   if (ae == StrIntrinsicNode::UL) {
2609     pmovzxbw(vec, Address(str2, 0));
2610   } else {
2611     movdqu(vec, Address(str2, 0));
2612   }
2613   movl(cnt2, int_cnt2);
2614   movptr(result, str1); // string addr
2615 
2616   if (int_cnt2 > stride) {
2617     jmpb(SCAN_TO_SUBSTR);
2618 
2619     // Reload substr for rescan; this code
2620     // is executed only for large substrings (> 8 chars).
2621     bind(RELOAD_SUBSTR);
2622     if (ae == StrIntrinsicNode::UL) {
2623       pmovzxbw(vec, Address(str2, 0));
2624     } else {
2625       movdqu(vec, Address(str2, 0));
2626     }
2627     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2628 
2629     bind(RELOAD_STR);
2630     // We came here after the beginning of the substring was
2631     // matched but the rest of it was not, so we need to search
2632     // again. Start from the next element after the previous match.
2633 
2634     // cnt2 is the number of remaining substring elements and
2635     // cnt1 is the number of remaining string elements when the compare failed.
2636     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2637     subl(cnt1, cnt2);
2638     addl(cnt1, int_cnt2);
2639     movl(cnt2, int_cnt2); // Now restore cnt2
2640 
2641     decrementl(cnt1);     // Shift to next element
2642     cmpl(cnt1, cnt2);
2643     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
2644 
2645     addptr(result, (1<<scale1));
2646 
2647   } // (int_cnt2 > 8)
2648 
2649   // Scan string for start of substr in 16-byte vectors
2650   bind(SCAN_TO_SUBSTR);
2651   pcmpestri(vec, Address(result, 0), mode);
2652   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2653   subl(cnt1, stride);
2654   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2655   cmpl(cnt1, cnt2);
2656   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
2657   addptr(result, 16);
2658   jmpb(SCAN_TO_SUBSTR);
2659 
2660   // Found a potential substr
2661   bind(FOUND_CANDIDATE);
2662   // Matched whole vector if first element matched (tmp(rcx) == 0).
2663   if (int_cnt2 == stride) {
2664     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2665   } else { // int_cnt2 > 8
2666     jccb(Assembler::overflow, FOUND_SUBSTR);
2667   }
2668   // After pcmpestri tmp(rcx) contains matched element index
2669   // Compute start addr of substr
2670   lea(result, Address(result, tmp, scale1));
2671 
2672   // Make sure string is still long enough
2673   subl(cnt1, tmp);
2674   cmpl(cnt1, cnt2);
2675   if (int_cnt2 == stride) {
2676     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2677   } else { // int_cnt2 > 8
2678     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2679   }
2680   // Left less than substring.
2681 
2682   bind(RET_NOT_FOUND);
2683   movl(result, -1);
2684   jmp(EXIT);
2685 
2686   if (int_cnt2 > stride) {
2687     // This code is optimized for the case when whole substring
2688     // is matched if its head is matched.
2689     bind(MATCH_SUBSTR_HEAD);
2690     pcmpestri(vec, Address(result, 0), mode);
2691     // Reload only the string if it does not match
2692     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2693 
2694     Label CONT_SCAN_SUBSTR;
2695     // Compare the rest of substring (> 8 chars).
2696     bind(FOUND_SUBSTR);
2697     // First 8 chars are already matched.
2698     negptr(cnt2);
2699     addptr(cnt2, stride);
2700 
2701     bind(SCAN_SUBSTR);
2702     subl(cnt1, stride);
2703     cmpl(cnt2, -stride); // Do not read beyond substring
2704     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2705     // Back-up strings to avoid reading beyond substring:
2706     // cnt1 = cnt1 - cnt2 + 8
2707     addl(cnt1, cnt2); // cnt2 is negative
2708     addl(cnt1, stride);
2709     movl(cnt2, stride); negptr(cnt2);
2710     bind(CONT_SCAN_SUBSTR);
2711     if (int_cnt2 < (int)G) {
2712       int tail_off1 = int_cnt2<<scale1;
2713       int tail_off2 = int_cnt2<<scale2;
2714       if (ae == StrIntrinsicNode::UL) {
2715         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2716       } else {
2717         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2718       }
2719       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2720     } else {
2721       // calculate index in register to avoid integer overflow (int_cnt2*2)
2722       movl(tmp, int_cnt2);
2723       addptr(tmp, cnt2);
2724       if (ae == StrIntrinsicNode::UL) {
2725         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2726       } else {
2727         movdqu(vec, Address(str2, tmp, scale2, 0));
2728       }
2729       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2730     }
2731     // Need to reload string pointers if we did not match the whole vector
2732     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2733     addptr(cnt2, stride);
2734     jcc(Assembler::negative, SCAN_SUBSTR);
2735     // Fall through if found full substring
2736 
2737   } // (int_cnt2 > 8)
2738 
2739   bind(RET_FOUND);
2740   // Found result if we matched full small substring.
2741   // Compute substr offset
2742   subptr(result, str1);
2743   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2744     shrl(result, 1); // index
2745   }
2746   bind(EXIT);
2747 
2748 } // string_indexofC8
2749 
2750 // Small strings are loaded through the stack if they cross a page boundary.
2751 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2752                                        Register cnt1, Register cnt2,
2753                                        int int_cnt2,  Register result,
2754                                        XMMRegister vec, Register tmp,
2755                                        int ae) {
2756   ShortBranchVerifier sbv(this);
2757   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2758   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2759 
2760   //
2761   // int_cnt2 is length of small (< 8 chars) constant substring
2762   // or (-1) for non constant substring in which case its length
2763   // is in cnt2 register.
2764   //
2765   // Note, inline_string_indexOf() generates checks:
2766   // if (substr.count > string.count) return -1;
2767   // if (substr.count == 0) return 0;
2768   //
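       // E.g. a constant 3-char substring arrives with int_cnt2 == 3, while a
       // non-constant substring arrives with int_cnt2 == -1 and its length in cnt2.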
2769   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2770   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2771   // This method uses the pcmpestri instruction with bound registers
2772   //   inputs:
2773   //     xmm - substring
2774   //     rax - substring length (elements count)
2775   //     mem - scanned string
2776   //     rdx - string length (elements count)
2777   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2778   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2779   //   outputs:
2780   //     rcx - matched index in string
2781   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2782   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2783   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2784   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2785 
2786   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2787         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2788         FOUND_CANDIDATE;
2789 
2790   { //========================================================
2791     // We don't know where these strings are located
2792     // and we can't read beyond them. Load them through the stack.
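         // Page-crossing check used below (sketch): a 16-byte load starting at
         // addr stays within its page when (addr & (page_size - 1)) <= page_size - 16,
         // so reading a little past a short string is safe in that case.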
2793     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2794 
2795     movptr(tmp, rsp); // save old SP
2796 
2797     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2798       if (int_cnt2 == (1>>scale2)) { // One byte
2799         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2800         load_unsigned_byte(result, Address(str2, 0));
2801         movdl(vec, result); // move 32 bits
2802       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2803         // Not enough header space in 32-bit VM: 12+3 = 15.
2804         movl(result, Address(str2, -1));
2805         shrl(result, 8);
2806         movdl(vec, result); // move 32 bits
2807       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2808         load_unsigned_short(result, Address(str2, 0));
2809         movdl(vec, result); // move 32 bits
2810       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2811         movdl(vec, Address(str2, 0)); // move 32 bits
2812       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2813         movq(vec, Address(str2, 0));  // move 64 bits
2814       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2815         // Array header size is 12 bytes in 32-bit VM
2816         // + 6 bytes for 3 chars == 18 bytes,
2817         // enough space to load vec and shift.
2818         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2819         if (ae == StrIntrinsicNode::UL) {
2820           int tail_off = int_cnt2-8;
2821           pmovzxbw(vec, Address(str2, tail_off));
2822           psrldq(vec, -2*tail_off);
2823         }
2824         else {
2825           int tail_off = int_cnt2*(1<<scale2);
2826           movdqu(vec, Address(str2, tail_off-16));
2827           psrldq(vec, 16-tail_off);
2828         }
2829       }
2830     } else { // not constant substring
2831       cmpl(cnt2, stride);
2832       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2833 
2834       // We can read beyond the string if str+16 does not cross a page boundary
2835       // since heaps are aligned and mapped by pages.
2836       assert(os::vm_page_size() < (int)G, "default page should be small");
2837       movl(result, str2); // We need only low 32 bits
2838       andl(result, ((int)os::vm_page_size()-1));
2839       cmpl(result, ((int)os::vm_page_size()-16));
2840       jccb(Assembler::belowEqual, CHECK_STR);
2841 
2842       // Move small strings to the stack to allow loading 16 bytes into vec.
2843       subptr(rsp, 16);
2844       int stk_offset = wordSize-(1<<scale2);
2845       push(cnt2);
2846 
2847       bind(COPY_SUBSTR);
2848       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2849         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2850         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2851       } else if (ae == StrIntrinsicNode::UU) {
2852         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2853         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2854       }
2855       decrement(cnt2);
2856       jccb(Assembler::notZero, COPY_SUBSTR);
2857 
2858       pop(cnt2);
2859       movptr(str2, rsp);  // New substring address
2860     } // non constant
2861 
2862     bind(CHECK_STR);
2863     cmpl(cnt1, stride);
2864     jccb(Assembler::aboveEqual, BIG_STRINGS);
2865 
2866     // Check cross page boundary.
2867     movl(result, str1); // We need only low 32 bits
2868     andl(result, ((int)os::vm_page_size()-1));
2869     cmpl(result, ((int)os::vm_page_size()-16));
2870     jccb(Assembler::belowEqual, BIG_STRINGS);
2871 
2872     subptr(rsp, 16);
2873     int stk_offset = -(1<<scale1);
2874     if (int_cnt2 < 0) { // not constant
2875       push(cnt2);
2876       stk_offset += wordSize;
2877     }
2878     movl(cnt2, cnt1);
2879 
2880     bind(COPY_STR);
2881     if (ae == StrIntrinsicNode::LL) {
2882       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2883       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2884     } else {
2885       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2886       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2887     }
2888     decrement(cnt2);
2889     jccb(Assembler::notZero, COPY_STR);
2890 
2891     if (int_cnt2 < 0) { // not constant
2892       pop(cnt2);
2893     }
2894     movptr(str1, rsp);  // New string address
2895 
2896     bind(BIG_STRINGS);
2897     // Load substring.
2898     if (int_cnt2 < 0) { // -1
2899       if (ae == StrIntrinsicNode::UL) {
2900         pmovzxbw(vec, Address(str2, 0));
2901       } else {
2902         movdqu(vec, Address(str2, 0));
2903       }
2904       push(cnt2);       // substr count
2905       push(str2);       // substr addr
2906       push(str1);       // string addr
2907     } else {
2908       // Small (< 8 chars) constant substrings are loaded already.
2909       movl(cnt2, int_cnt2);
2910     }
2911     push(tmp);  // original SP
2912 
2913   } // Finished loading
2914 
2915   //========================================================
2916   // Start search
2917   //
2918 
2919   movptr(result, str1); // string addr
2920 
2921   if (int_cnt2  < 0) {  // Only for non constant substring
2922     jmpb(SCAN_TO_SUBSTR);
2923 
2924     // SP saved at sp+0
2925     // String saved at sp+1*wordSize
2926     // Substr saved at sp+2*wordSize
2927     // Substr count saved at sp+3*wordSize
2928 
2929     // Reload substr for rescan; this code
2930     // is executed only for large substrings (> 8 chars).
2931     bind(RELOAD_SUBSTR);
2932     movptr(str2, Address(rsp, 2*wordSize));
2933     movl(cnt2, Address(rsp, 3*wordSize));
2934     if (ae == StrIntrinsicNode::UL) {
2935       pmovzxbw(vec, Address(str2, 0));
2936     } else {
2937       movdqu(vec, Address(str2, 0));
2938     }
2939     // We came here after the beginning of the substring was
2940     // matched but the rest of it was not, so we need to search
2941     // again. Start from the next element after the previous match.
2942     subptr(str1, result); // Restore counter
2943     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2944       shrl(str1, 1);
2945     }
2946     addl(cnt1, str1);
2947     decrementl(cnt1);   // Shift to next element
2948     cmpl(cnt1, cnt2);
2949     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2950 
2951     addptr(result, (1<<scale1));
2952   } // non constant
2953 
2954   // Scan string for start of substr in 16-byte vectors
2955   bind(SCAN_TO_SUBSTR);
2956   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2957   pcmpestri(vec, Address(result, 0), mode);
2958   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2959   subl(cnt1, stride);
2960   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2961   cmpl(cnt1, cnt2);
2962   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2963   addptr(result, 16);
2964 
2965   bind(ADJUST_STR);
2966   cmpl(cnt1, stride); // Do not read beyond string
2967   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2968   // Back-up string to avoid reading beyond string.
2969   lea(result, Address(result, cnt1, scale1, -16));
2970   movl(cnt1, stride);
2971   jmpb(SCAN_TO_SUBSTR);
2972 
2973   // Found a potential substr
2974   bind(FOUND_CANDIDATE);
2975   // After pcmpestri tmp(rcx) contains matched element index
2976 
2977   // Make sure string is still long enough
2978   subl(cnt1, tmp);
2979   cmpl(cnt1, cnt2);
2980   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2981   // Left less than substring.
2982 
2983   bind(RET_NOT_FOUND);
2984   movl(result, -1);
2985   jmp(CLEANUP);
2986 
2987   bind(FOUND_SUBSTR);
2988   // Compute start addr of substr
2989   lea(result, Address(result, tmp, scale1));
2990   if (int_cnt2 > 0) { // Constant substring
2991     // Repeat search for small substring (< 8 chars)
2992     // from new point without reloading substring.
2993     // Have to check that we don't read beyond string.
2994     cmpl(tmp, stride-int_cnt2);
2995     jccb(Assembler::greater, ADJUST_STR);
2996     // Fall through if matched whole substring.
2997   } else { // non constant
2998     assert(int_cnt2 == -1, "should be != 0");
2999 
3000     addl(tmp, cnt2);
3001     // Found result if we matched whole substring.
3002     cmpl(tmp, stride);
3003     jcc(Assembler::lessEqual, RET_FOUND);
3004 
3005     // Repeat search for small substring (<= 8 chars)
3006     // from new point 'str1' without reloading substring.
3007     cmpl(cnt2, stride);
3008     // Have to check that we don't read beyond string.
3009     jccb(Assembler::lessEqual, ADJUST_STR);
3010 
3011     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3012     // Compare the rest of substring (> 8 chars).
3013     movptr(str1, result);
3014 
3015     cmpl(tmp, cnt2);
3016     // First 8 chars are already matched.
3017     jccb(Assembler::equal, CHECK_NEXT);
3018 
3019     bind(SCAN_SUBSTR);
3020     pcmpestri(vec, Address(str1, 0), mode);
3021     // Need to reload string pointers if we did not match the whole vector
3022     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3023 
3024     bind(CHECK_NEXT);
3025     subl(cnt2, stride);
3026     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3027     addptr(str1, 16);
3028     if (ae == StrIntrinsicNode::UL) {
3029       addptr(str2, 8);
3030     } else {
3031       addptr(str2, 16);
3032     }
3033     subl(cnt1, stride);
3034     cmpl(cnt2, stride); // Do not read beyond substring
3035     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3036     // Back-up strings to avoid reading beyond substring.
3037 
3038     if (ae == StrIntrinsicNode::UL) {
3039       lea(str2, Address(str2, cnt2, scale2, -8));
3040       lea(str1, Address(str1, cnt2, scale1, -16));
3041     } else {
3042       lea(str2, Address(str2, cnt2, scale2, -16));
3043       lea(str1, Address(str1, cnt2, scale1, -16));
3044     }
3045     subl(cnt1, cnt2);
3046     movl(cnt2, stride);
3047     addl(cnt1, stride);
3048     bind(CONT_SCAN_SUBSTR);
3049     if (ae == StrIntrinsicNode::UL) {
3050       pmovzxbw(vec, Address(str2, 0));
3051     } else {
3052       movdqu(vec, Address(str2, 0));
3053     }
3054     jmp(SCAN_SUBSTR);
3055 
3056     bind(RET_FOUND_LONG);
3057     movptr(str1, Address(rsp, wordSize));
3058   } // non constant
3059 
3060   bind(RET_FOUND);
3061   // Compute substr offset
3062   subptr(result, str1);
3063   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3064     shrl(result, 1); // index
3065   }
3066   bind(CLEANUP);
3067   pop(rsp); // restore SP
3068 
3069 } // string_indexof
3070 
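     // Reference semantics (sketch): scan UTF-16 chars for ch, return the char
     // index of the first match or -1.
     //   for (int i = 0; i < cnt1; i++) {
     //     if (str1[i] == ch) return i;
     //   }
     //   return -1;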
3071 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3072                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3073   ShortBranchVerifier sbv(this);
3074   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3075 
3076   int stride = 8;
3077 
3078   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3079         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3080         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3081         FOUND_SEQ_CHAR, DONE_LABEL;
3082 
3083   movptr(result, str1);
3084   if (UseAVX >= 2) {
3085     cmpl(cnt1, stride);
3086     jcc(Assembler::less, SCAN_TO_CHAR);
3087     cmpl(cnt1, 2*stride);
3088     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3089     movdl(vec1, ch);
3090     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3091     vpxor(vec2, vec2);
3092     movl(tmp, cnt1);
3093     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3094     andl(cnt1,0x0000000F);  //tail count (in chars)
3095 
3096     bind(SCAN_TO_16_CHAR_LOOP);
3097     vmovdqu(vec3, Address(result, 0));
3098     vpcmpeqw(vec3, vec3, vec1, 1);
3099     vptest(vec2, vec3);
3100     jcc(Assembler::carryClear, FOUND_CHAR);
3101     addptr(result, 32);
3102     subl(tmp, 2*stride);
3103     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3104     jmp(SCAN_TO_8_CHAR);
3105     bind(SCAN_TO_8_CHAR_INIT);
3106     movdl(vec1, ch);
3107     pshuflw(vec1, vec1, 0x00);
3108     pshufd(vec1, vec1, 0);
3109     pxor(vec2, vec2);
3110   }
3111   bind(SCAN_TO_8_CHAR);
3112   cmpl(cnt1, stride);
3113   jcc(Assembler::less, SCAN_TO_CHAR);
3114   if (UseAVX < 2) {
3115     movdl(vec1, ch);
3116     pshuflw(vec1, vec1, 0x00);
3117     pshufd(vec1, vec1, 0);
3118     pxor(vec2, vec2);
3119   }
3120   movl(tmp, cnt1);
3121   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3122   andl(cnt1,0x00000007);  //tail count (in chars)
3123 
3124   bind(SCAN_TO_8_CHAR_LOOP);
3125   movdqu(vec3, Address(result, 0));
3126   pcmpeqw(vec3, vec1);
3127   ptest(vec2, vec3);
3128   jcc(Assembler::carryClear, FOUND_CHAR);
3129   addptr(result, 16);
3130   subl(tmp, stride);
3131   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3132   bind(SCAN_TO_CHAR);
3133   testl(cnt1, cnt1);
3134   jcc(Assembler::zero, RET_NOT_FOUND);
3135   bind(SCAN_TO_CHAR_LOOP);
3136   load_unsigned_short(tmp, Address(result, 0));
3137   cmpl(ch, tmp);
3138   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3139   addptr(result, 2);
3140   subl(cnt1, 1);
3141   jccb(Assembler::zero, RET_NOT_FOUND);
3142   jmp(SCAN_TO_CHAR_LOOP);
3143 
3144   bind(RET_NOT_FOUND);
3145   movl(result, -1);
3146   jmpb(DONE_LABEL);
3147 
3148   bind(FOUND_CHAR);
3149   if (UseAVX >= 2) {
3150     vpmovmskb(tmp, vec3);
3151   } else {
3152     pmovmskb(tmp, vec3);
3153   }
3154   bsfl(ch, tmp);
3155   addptr(result, ch);
3156 
3157   bind(FOUND_SEQ_CHAR);
3158   subptr(result, str1);
3159   shrl(result, 1);
3160 
3161   bind(DONE_LABEL);
3162 } // string_indexof_char
3163 
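     // Latin-1 variant of the scan above (sketch): the loop below compares bytes
     // instead of chars, so the result is a byte index and no final shift is needed.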
3164 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3165                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3166   ShortBranchVerifier sbv(this);
3167   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3168 
3169   int stride = 16;
3170 
3171   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3172         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3173         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3174         FOUND_SEQ_CHAR, DONE_LABEL;
3175 
3176   movptr(result, str1);
3177   if (UseAVX >= 2) {
3178     cmpl(cnt1, stride);
3179     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3180     cmpl(cnt1, stride*2);
3181     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3182     movdl(vec1, ch);
3183     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3184     vpxor(vec2, vec2);
3185     movl(tmp, cnt1);
3186     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3187     andl(cnt1,0x0000001F);  //tail count (in chars)
3188 
3189     bind(SCAN_TO_32_CHAR_LOOP);
3190     vmovdqu(vec3, Address(result, 0));
3191     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3192     vptest(vec2, vec3);
3193     jcc(Assembler::carryClear, FOUND_CHAR);
3194     addptr(result, 32);
3195     subl(tmp, stride*2);
3196     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3197     jmp(SCAN_TO_16_CHAR);
3198 
3199     bind(SCAN_TO_16_CHAR_INIT);
3200     movdl(vec1, ch);
3201     pxor(vec2, vec2);
3202     pshufb(vec1, vec2);
3203   }
3204 
3205   bind(SCAN_TO_16_CHAR);
3206   cmpl(cnt1, stride);
3207   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3208   if (UseAVX < 2) {
3209     movdl(vec1, ch);
3210     pxor(vec2, vec2);
3211     pshufb(vec1, vec2);
3212   }
3213   movl(tmp, cnt1);
3214   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3215   andl(cnt1,0x0000000F);  //tail count (in bytes)
3216 
3217   bind(SCAN_TO_16_CHAR_LOOP);
3218   movdqu(vec3, Address(result, 0));
3219   pcmpeqb(vec3, vec1);
3220   ptest(vec2, vec3);
3221   jcc(Assembler::carryClear, FOUND_CHAR);
3222   addptr(result, 16);
3223   subl(tmp, stride);
3224   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3225 
3226   bind(SCAN_TO_CHAR_INIT);
3227   testl(cnt1, cnt1);
3228   jcc(Assembler::zero, RET_NOT_FOUND);
3229   bind(SCAN_TO_CHAR_LOOP);
3230   load_unsigned_byte(tmp, Address(result, 0));
3231   cmpl(ch, tmp);
3232   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3233   addptr(result, 1);
3234   subl(cnt1, 1);
3235   jccb(Assembler::zero, RET_NOT_FOUND);
3236   jmp(SCAN_TO_CHAR_LOOP);
3237 
3238   bind(RET_NOT_FOUND);
3239   movl(result, -1);
3240   jmpb(DONE_LABEL);
3241 
3242   bind(FOUND_CHAR);
3243   if (UseAVX >= 2) {
3244     vpmovmskb(tmp, vec3);
3245   } else {
3246     pmovmskb(tmp, vec3);
3247   }
3248   bsfl(ch, tmp);
3249   addptr(result, ch);
3250 
3251   bind(FOUND_SEQ_CHAR);
3252   subptr(result, str1);
3253 
3254   bind(DONE_LABEL);
3255 } // stringL_indexof_char
3256 
3257 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3258   switch (eltype) {
3259   case T_BOOLEAN: return sizeof(jboolean);
3260   case T_BYTE:  return sizeof(jbyte);
3261   case T_SHORT: return sizeof(jshort);
3262   case T_CHAR:  return sizeof(jchar);
3263   case T_INT:   return sizeof(jint);
3264   default:
3265     ShouldNotReachHere();
3266     return -1;
3267   }
3268 }
3269 
3270 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3271   switch (eltype) {
3272   // T_BOOLEAN used as surrogate for unsigned byte
3273   case T_BOOLEAN: movzbl(dst, src);   break;
3274   case T_BYTE:    movsbl(dst, src);   break;
3275   case T_SHORT:   movswl(dst, src);   break;
3276   case T_CHAR:    movzwl(dst, src);   break;
3277   case T_INT:     movl(dst, src);     break;
3278   default:
3279     ShouldNotReachHere();
3280   }
3281 }
3282 
3283 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3284   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3285 }
3286 
3287 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3288   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3289 }
3290 
3291 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3292   const int vlen = Assembler::AVX_256bit;
3293   switch (eltype) {
3294   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3295   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3296   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3297   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3298   case T_INT:
3299     // do nothing
3300     break;
3301   default:
3302     ShouldNotReachHere();
3303   }
3304 }
3305 
3306 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3307                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3308                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3309                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3310                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3311                                         BasicType eltype) {
3312   ShortBranchVerifier sbv(this);
3313   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3314   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3315   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3316 
3317   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3318         SHORT_UNROLLED_LOOP_EXIT,
3319         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3320         UNROLLED_VECTOR_LOOP_BEGIN,
3321         END;
3322   switch (eltype) {
3323   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3324   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3325   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3326   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3327   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3328   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3329   }
3330 
3331   // Register "renaming" for readability of the code below
3332   XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3333               vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3334               vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3335 
3336   const int elsize = arrays_hashcode_elsize(eltype);
3337 
3338   /*
3339     if (cnt1 >= 2) {
3340       if (cnt1 >= 32) {
3341         UNROLLED VECTOR LOOP
3342       }
3343       UNROLLED SCALAR LOOP
3344     }
3345     SINGLE SCALAR
3346    */
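       // Scalar computation being vectorized here (sketch, the standard
       // polynomial hash):
       //   for (int i = 0; i < cnt1; i++) {
       //     result = 31 * result + ary1[i];
       //   }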
3347 
3348   cmpl(cnt1, 32);
3349   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3350 
3351   // cnt1 >= 32 && generate_vectorized_loop
3352   xorl(index, index);
3353 
3354   // vresult = IntVector.zero(I256);
3355   for (int idx = 0; idx < 4; idx++) {
3356     vpxor(vresult[idx], vresult[idx]);
3357   }
3358   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3359   Register bound = tmp2;
3360   Register next = tmp3;
3361   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3362   movl(next, Address(tmp2, 0));
3363   movdl(vnext, next);
3364   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3365 
3366   // index = 0;
3367   // bound = cnt1 & ~(32 - 1);
3368   movl(bound, cnt1);
3369   andl(bound, ~(32 - 1));
3370   // for (; index < bound; index += 32) {
3371   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3372   // result *= next;
3373   imull(result, next);
3374   // loop fission to front-load the cost of fetching from memory; OOO execution
3375   // can then hopefully do a better job of prefetching
3376   for (int idx = 0; idx < 4; idx++) {
3377     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3378   }
3379   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3380   for (int idx = 0; idx < 4; idx++) {
3381     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3382     arrays_hashcode_elvcast(vtmp[idx], eltype);
3383     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3384   }
3385   // index += 32;
3386   addl(index, 32);
3387   // index < bound;
3388   cmpl(index, bound);
3389   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3390   // }
3391 
3392   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3393   subl(cnt1, bound);
3394   // release bound
3395 
3396   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3397   for (int idx = 0; idx < 4; idx++) {
3398     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3399     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3400     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3401   }
3402   // result += vresult.reduceLanes(ADD);
3403   for (int idx = 0; idx < 4; idx++) {
3404     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3405   }
3406 
3407   // } else if (cnt1 < 32) {
3408 
3409   bind(SHORT_UNROLLED_BEGIN);
3410   // int i = 1;
3411   movl(index, 1);
3412   cmpl(index, cnt1);
3413   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3414 
3415   // for (; i < cnt1 ; i += 2) {
3416   bind(SHORT_UNROLLED_LOOP_BEGIN);
3417   movl(tmp3, 961);
3418   imull(result, tmp3);
3419   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3420   movl(tmp3, tmp2);
3421   shll(tmp3, 5);
3422   subl(tmp3, tmp2);
3423   addl(result, tmp3);
3424   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3425   addl(result, tmp3);
3426   addl(index, 2);
3427   cmpl(index, cnt1);
3428   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3429 
3430   // }
3431   // if (i >= cnt1) {
3432   bind(SHORT_UNROLLED_LOOP_EXIT);
3433   jccb(Assembler::greater, END);
3434   movl(tmp2, result);
3435   shll(result, 5);
3436   subl(result, tmp2);
3437   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3438   addl(result, tmp3);
3439   // }
3440   bind(END);
3441 
3442   BLOCK_COMMENT("} // arrays_hashcode");
3443 
3444 } // arrays_hashcode
3445 
3446 // helper function for string_compare
3447 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3448                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3449                                            Address::ScaleFactor scale2, Register index, int ae) {
3450   if (ae == StrIntrinsicNode::LL) {
3451     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3452     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3453   } else if (ae == StrIntrinsicNode::UU) {
3454     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3455     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3456   } else {
3457     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3458     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3459   }
3460 }
3461 
3462 // Compare strings, used for char[] and byte[].
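     // Reference semantics (sketch): lexicographic compare; the result follows the
     // first mismatching element, otherwise the length difference.
     //   int lim = min(cnt1, cnt2);
     //   for (int k = 0; k < lim; k++) {
     //     if (str1[k] != str2[k]) return str1[k] - str2[k];
     //   }
     //   return cnt1 - cnt2;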
3463 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3464                                        Register cnt1, Register cnt2, Register result,
3465                                        XMMRegister vec1, int ae, KRegister mask) {
3466   ShortBranchVerifier sbv(this);
3467   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3468   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3469   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3470   int stride2x2 = 0x40;
3471   Address::ScaleFactor scale = Address::no_scale;
3472   Address::ScaleFactor scale1 = Address::no_scale;
3473   Address::ScaleFactor scale2 = Address::no_scale;
3474 
3475   if (ae != StrIntrinsicNode::LL) {
3476     stride2x2 = 0x20;
3477   }
3478 
3479   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3480     shrl(cnt2, 1);
3481   }
3482   // Compute the minimum of the string lengths and the
3483   // difference of the string lengths (stack).
3484   // Do the conditional move stuff
3485   movl(result, cnt1);
3486   subl(cnt1, cnt2);
3487   push(cnt1);
3488   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3489 
3490   // Is the minimum length zero?
3491   testl(cnt2, cnt2);
3492   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3493   if (ae == StrIntrinsicNode::LL) {
3494     // Load first bytes
3495     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3496     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3497   } else if (ae == StrIntrinsicNode::UU) {
3498     // Load first characters
3499     load_unsigned_short(result, Address(str1, 0));
3500     load_unsigned_short(cnt1, Address(str2, 0));
3501   } else {
3502     load_unsigned_byte(result, Address(str1, 0));
3503     load_unsigned_short(cnt1, Address(str2, 0));
3504   }
3505   subl(result, cnt1);
3506   jcc(Assembler::notZero,  POP_LABEL);
3507 
3508   if (ae == StrIntrinsicNode::UU) {
3509     // Divide length by 2 to get number of chars
3510     shrl(cnt2, 1);
3511   }
3512   cmpl(cnt2, 1);
3513   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3514 
3515   // Check if the strings start at the same location and setup scale and stride
3516   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3517     cmpptr(str1, str2);
3518     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3519     if (ae == StrIntrinsicNode::LL) {
3520       scale = Address::times_1;
3521       stride = 16;
3522     } else {
3523       scale = Address::times_2;
3524       stride = 8;
3525     }
3526   } else {
3527     scale1 = Address::times_1;
3528     scale2 = Address::times_2;
3529     // scale not used
3530     stride = 8;
3531   }
3532 
3533   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3534     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3535     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3536     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3537     Label COMPARE_TAIL_LONG;
3538     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3539 
3540     int pcmpmask = 0x19;
3541     if (ae == StrIntrinsicNode::LL) {
3542       pcmpmask &= ~0x01;
3543     }
3544 
3545     // Setup to compare 16-chars (32-bytes) vectors,
3546     // start from the first character again because it has an aligned address.
3547     if (ae == StrIntrinsicNode::LL) {
3548       stride2 = 32;
3549     } else {
3550       stride2 = 16;
3551     }
3552     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3553       adr_stride = stride << scale;
3554     } else {
3555       adr_stride1 = 8;  //stride << scale1;
3556       adr_stride2 = 16; //stride << scale2;
3557     }
3558 
3559     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3560     // rax and rdx are used by pcmpestri as element counters
3561     movl(result, cnt2);
3562     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3563     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3564 
3565     // fast path : compare first 2 8-char vectors.
3566     bind(COMPARE_16_CHARS);
3567     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3568       movdqu(vec1, Address(str1, 0));
3569     } else {
3570       pmovzxbw(vec1, Address(str1, 0));
3571     }
3572     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3573     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3574 
3575     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3576       movdqu(vec1, Address(str1, adr_stride));
3577       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3578     } else {
3579       pmovzxbw(vec1, Address(str1, adr_stride1));
3580       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3581     }
3582     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3583     addl(cnt1, stride);
3584 
3585     // Compare the characters at index in cnt1
3586     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3587     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3588     subl(result, cnt2);
3589     jmp(POP_LABEL);
3590 
3591     // Setup the registers to start vector comparison loop
3592     bind(COMPARE_WIDE_VECTORS);
3593     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3594       lea(str1, Address(str1, result, scale));
3595       lea(str2, Address(str2, result, scale));
3596     } else {
3597       lea(str1, Address(str1, result, scale1));
3598       lea(str2, Address(str2, result, scale2));
3599     }
3600     subl(result, stride2);
3601     subl(cnt2, stride2);
3602     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3603     negptr(result);
3604 
3605     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3606     bind(COMPARE_WIDE_VECTORS_LOOP);
3607 
3608 #ifdef _LP64
3609     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3610       cmpl(cnt2, stride2x2);
3611       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3612       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3613       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3614 
3615       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3616       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3617         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3618         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3619       } else {
3620         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3621         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3622       }
3623       kortestql(mask, mask);
3624       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3625       addptr(result, stride2x2);  // update since we already compared at this addr
3626       subl(cnt2, stride2x2);      // and sub the size too
3627       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3628 
3629       vpxor(vec1, vec1);
3630       jmpb(COMPARE_WIDE_TAIL);
3631     }//if (VM_Version::supports_avx512vlbw())
3632 #endif // _LP64
3633 
3634 
3635     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3636     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3637       vmovdqu(vec1, Address(str1, result, scale));
3638       vpxor(vec1, Address(str2, result, scale));
3639     } else {
3640       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3641       vpxor(vec1, Address(str2, result, scale2));
3642     }
3643     vptest(vec1, vec1);
3644     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3645     addptr(result, stride2);
3646     subl(cnt2, stride2);
3647     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3648     // clean upper bits of YMM registers
3649     vpxor(vec1, vec1);
3650 
3651     // compare wide vectors tail
3652     bind(COMPARE_WIDE_TAIL);
3653     testptr(result, result);
3654     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3655 
3656     movl(result, stride2);
3657     movl(cnt2, result);
3658     negptr(result);
3659     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3660 
3661     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3662     bind(VECTOR_NOT_EQUAL);
3663     // clean upper bits of YMM registers
3664     vpxor(vec1, vec1);
3665     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3666       lea(str1, Address(str1, result, scale));
3667       lea(str2, Address(str2, result, scale));
3668     } else {
3669       lea(str1, Address(str1, result, scale1));
3670       lea(str2, Address(str2, result, scale2));
3671     }
3672     jmp(COMPARE_16_CHARS);
3673 
3674     // Compare tail chars, length between 1 and 15 chars
3675     bind(COMPARE_TAIL_LONG);
3676     movl(cnt2, result);
3677     cmpl(cnt2, stride);
3678     jcc(Assembler::less, COMPARE_SMALL_STR);
3679 
3680     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3681       movdqu(vec1, Address(str1, 0));
3682     } else {
3683       pmovzxbw(vec1, Address(str1, 0));
3684     }
3685     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3686     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3687     subptr(cnt2, stride);
3688     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3689     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3690       lea(str1, Address(str1, result, scale));
3691       lea(str2, Address(str2, result, scale));
3692     } else {
3693       lea(str1, Address(str1, result, scale1));
3694       lea(str2, Address(str2, result, scale2));
3695     }
3696     negptr(cnt2);
3697     jmpb(WHILE_HEAD_LABEL);
3698 
3699     bind(COMPARE_SMALL_STR);
3700   } else if (UseSSE42Intrinsics) {
3701     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3702     int pcmpmask = 0x19;
3703     // Setup to compare 8-char (16-byte) vectors,
3704     // start from the first character again because it has an aligned address.
3705     movl(result, cnt2);
3706     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3707     if (ae == StrIntrinsicNode::LL) {
3708       pcmpmask &= ~0x01;
3709     }
3710     jcc(Assembler::zero, COMPARE_TAIL);
3711     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3712       lea(str1, Address(str1, result, scale));
3713       lea(str2, Address(str2, result, scale));
3714     } else {
3715       lea(str1, Address(str1, result, scale1));
3716       lea(str2, Address(str2, result, scale2));
3717     }
3718     negptr(result);
3719 
3720     // pcmpestri
3721     //   inputs:
3722     //     vec1- substring
3723     //     rax - negative string length (elements count)
3724     //     mem - scanned string
3725     //     rdx - string length (elements count)
3726     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3727     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3728     //   outputs:
3729     //     rcx - first mismatched element index
3730     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3731 
3732     bind(COMPARE_WIDE_VECTORS);
3733     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3734       movdqu(vec1, Address(str1, result, scale));
3735       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3736     } else {
3737       pmovzxbw(vec1, Address(str1, result, scale1));
3738       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3739     }
3740     // After pcmpestri cnt1(rcx) contains mismatched element index
3741 
3742     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3743     addptr(result, stride);
3744     subptr(cnt2, stride);
3745     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3746 
3747     // compare wide vectors tail
3748     testptr(result, result);
3749     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3750 
3751     movl(cnt2, stride);
3752     movl(result, stride);
3753     negptr(result);
3754     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3755       movdqu(vec1, Address(str1, result, scale));
3756       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3757     } else {
3758       pmovzxbw(vec1, Address(str1, result, scale1));
3759       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3760     }
3761     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3762 
3763     // Mismatched characters in the vectors
3764     bind(VECTOR_NOT_EQUAL);
3765     addptr(cnt1, result);
3766     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3767     subl(result, cnt2);
3768     jmpb(POP_LABEL);
3769 
3770     bind(COMPARE_TAIL); // limit is zero
3771     movl(cnt2, result);
3772     // Fallthru to tail compare
3773   }
3774   // Shift str2 and str1 to the end of the arrays, negate min
3775   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3776     lea(str1, Address(str1, cnt2, scale));
3777     lea(str2, Address(str2, cnt2, scale));
3778   } else {
3779     lea(str1, Address(str1, cnt2, scale1));
3780     lea(str2, Address(str2, cnt2, scale2));
3781   }
3782   decrementl(cnt2);  // first character was compared already
3783   negptr(cnt2);
3784 
3785   // Compare the rest of the elements
3786   bind(WHILE_HEAD_LABEL);
3787   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3788   subl(result, cnt1);
3789   jccb(Assembler::notZero, POP_LABEL);
3790   increment(cnt2);
3791   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3792 
3793   // Strings are equal up to min length.  Return the length difference.
3794   bind(LENGTH_DIFF_LABEL);
3795   pop(result);
3796   if (ae == StrIntrinsicNode::UU) {
3797     // Divide diff by 2 to get number of chars
3798     sarl(result, 1);
3799   }
3800   jmpb(DONE_LABEL);
3801 
3802 #ifdef _LP64
3803   if (VM_Version::supports_avx512vlbw()) {
3804 
3805     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3806 
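         // mask has a 1 bit for every pair of equal bytes (sketch): invert it and
         // find the lowest 0 bit, e.g. a first mismatch at byte 5 makes bit 5 the
         // lowest set bit of ~mask, so bsf yields 5.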
3807     kmovql(cnt1, mask);
3808     notq(cnt1);
3809     bsfq(cnt2, cnt1);
3810     if (ae != StrIntrinsicNode::LL) {
3811       // Divide diff by 2 to get number of chars
3812       sarl(cnt2, 1);
3813     }
3814     addq(result, cnt2);
3815     if (ae == StrIntrinsicNode::LL) {
3816       load_unsigned_byte(cnt1, Address(str2, result));
3817       load_unsigned_byte(result, Address(str1, result));
3818     } else if (ae == StrIntrinsicNode::UU) {
3819       load_unsigned_short(cnt1, Address(str2, result, scale));
3820       load_unsigned_short(result, Address(str1, result, scale));
3821     } else {
3822       load_unsigned_short(cnt1, Address(str2, result, scale2));
3823       load_unsigned_byte(result, Address(str1, result, scale1));
3824     }
3825     subl(result, cnt1);
3826     jmpb(POP_LABEL);
3827   }//if (VM_Version::supports_avx512vlbw())
3828 #endif // _LP64
3829 
3830   // Discard the stored length difference
3831   bind(POP_LABEL);
3832   pop(cnt1);
3833 
3834   // That's it
3835   bind(DONE_LABEL);
3836   if(ae == StrIntrinsicNode::UL) {
3837     negl(result);
3838   }
3839 
3840 }
3841 
3842 // Search for a non-ASCII character (negative byte value) in a byte array;
3843 // return the index of the first such character, otherwise the length
3844 // of the array segment searched.
3845 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3846 //   @IntrinsicCandidate
3847 //   public static int countPositives(byte[] ba, int off, int len) {
3848 //     for (int i = off; i < off + len; i++) {
3849 //       if (ba[i] < 0) {
3850 //         return i - off;
3851 //       }
3852 //     }
3853 //     return len;
3854 //   }
3855 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3856   Register result, Register tmp1,
3857   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3858   // rsi: byte array
3859   // rcx: len
3860   // rax: result
3861   ShortBranchVerifier sbv(this);
3862   assert_different_registers(ary1, len, result, tmp1);
3863   assert_different_registers(vec1, vec2);
3864   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3865 
3866   movl(result, len); // copy
3867   // len == 0
3868   testl(len, len);
3869   jcc(Assembler::zero, DONE);
3870 
3871   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3872     VM_Version::supports_avx512vlbw() &&
3873     VM_Version::supports_bmi2()) {
3874 
3875     Label test_64_loop, test_tail, BREAK_LOOP;
3876     Register tmp3_aliased = len;
3877 
3878     movl(tmp1, len);
3879     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3880 
3881     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3882     andl(len, ~(64 - 1));    // vector count (in chars)
3883     jccb(Assembler::zero, test_tail);
3884 
3885     lea(ary1, Address(ary1, len, Address::times_1));
3886     negptr(len);
3887 
3888     bind(test_64_loop);
3889     // Check whether our 64 elements of size byte contain negatives
3890     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3891     kortestql(mask1, mask1);
3892     jcc(Assembler::notZero, BREAK_LOOP);
3893 
3894     addptr(len, 64);
3895     jccb(Assembler::notZero, test_64_loop);
3896 
3897     bind(test_tail);
3898     // bail out when there is nothing to be done
3899     testl(tmp1, -1);
3900     jcc(Assembler::zero, DONE);
3901 
3902     // ~(~0 << len) applied up to two times (for 32-bit scenario)
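         // E.g. tmp1 == 3 yields a mask with only the lowest 3 bits set, so just
         // the 3 tail bytes take part in the final masked compare.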
3903 #ifdef _LP64
3904     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3905     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3906     notq(tmp3_aliased);
3907     kmovql(mask2, tmp3_aliased);
3908 #else
3909     Label k_init;
3910     jmp(k_init);
3911 
3912     // We cannot read 64 bits from a general purpose register, thus we move the
3913     // data required to compose 64 1's into the instruction stream.
3914     // We emit a 64-byte-wide series of elements from 0..63 which is later used
3915     // as compare targets with the tail count contained in the tmp1 register.
3916     // The result is a k register having tmp1 consecutive 1's set, counting
3917     // from the least significant bit.
3918     address tmp = pc();
3919     emit_int64(0x0706050403020100);
3920     emit_int64(0x0F0E0D0C0B0A0908);
3921     emit_int64(0x1716151413121110);
3922     emit_int64(0x1F1E1D1C1B1A1918);
3923     emit_int64(0x2726252423222120);
3924     emit_int64(0x2F2E2D2C2B2A2928);
3925     emit_int64(0x3736353433323130);
3926     emit_int64(0x3F3E3D3C3B3A3938);
3927 
3928     bind(k_init);
3929     lea(len, InternalAddress(tmp));
3930     // create mask to test for negative byte inside a vector
3931     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3932     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3933 
3934 #endif
3935     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3936     ktestq(mask1, mask2);
3937     jcc(Assembler::zero, DONE);
3938 
3939     bind(BREAK_LOOP);
3940     // At least one byte in the last 64 bytes is negative.
3941     // Set up to look at the last 64 bytes as if they were a tail
3942     lea(ary1, Address(ary1, len, Address::times_1));
3943     addptr(result, len);
3944     // Ignore the very last byte: if all others are positive,
3945     // it must be negative, so we can skip right to the 2+1 byte
3946     // end comparison at this point
3947     orl(result, 63);
3948     movl(len, 63);
3949     // Fallthru to tail compare
3950   } else {
3951 
3952     if (UseAVX >= 2 && UseSSE >= 2) {
3953       // With AVX2, use 32-byte vector compare
3954       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3955 
3956       // Compare 32-byte vectors
3957       testl(len, 0xffffffe0);   // vector count (in bytes)
3958       jccb(Assembler::zero, TAIL_START);
3959 
3960       andl(len, 0xffffffe0);
3961       lea(ary1, Address(ary1, len, Address::times_1));
3962       negptr(len);
3963 
3964       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3965       movdl(vec2, tmp1);
3966       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3967 
3968       bind(COMPARE_WIDE_VECTORS);
3969       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3970       vptest(vec1, vec2);
3971       jccb(Assembler::notZero, BREAK_LOOP);
3972       addptr(len, 32);
3973       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3974 
3975       testl(result, 0x0000001f);   // any bytes remaining?
3976       jcc(Assembler::zero, DONE);
3977 
3978       // Quick test using the already prepared vector mask
3979       movl(len, result);
3980       andl(len, 0x0000001f);
3981       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3982       vptest(vec1, vec2);
3983       jcc(Assembler::zero, DONE);
3984       // There is a negative byte in the tail; jump to the tail code to determine exactly where
3985       jmpb(TAIL_START);
3986 
3987       bind(BREAK_LOOP);
3988       // At least one byte in the last 32-byte vector is negative.
3989       // Set up to look at the last 32 bytes as if they were a tail
3990       lea(ary1, Address(ary1, len, Address::times_1));
3991       addptr(result, len);
3992       // Ignore the very last byte: if all others are positive,
3993       // it must be negative, so we can skip right to the 2+1 byte
3994       // end comparison at this point
3995       orl(result, 31);
3996       movl(len, 31);
3997       // Fallthru to tail compare
3998     } else if (UseSSE42Intrinsics) {
3999       // With SSE4.2, use double quad vector compare
4000       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4001 
4002       // Compare 16-byte vectors
4003       testl(len, 0xfffffff0);   // vector count (in bytes)
4004       jcc(Assembler::zero, TAIL_START);
4005 
4006       andl(len, 0xfffffff0);
4007       lea(ary1, Address(ary1, len, Address::times_1));
4008       negptr(len);
4009 
4010       movl(tmp1, 0x80808080);
4011       movdl(vec2, tmp1);
4012       pshufd(vec2, vec2, 0);
4013 
4014       bind(COMPARE_WIDE_VECTORS);
4015       movdqu(vec1, Address(ary1, len, Address::times_1));
4016       ptest(vec1, vec2);
4017       jccb(Assembler::notZero, BREAK_LOOP);
4018       addptr(len, 16);
4019       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4020 
4021       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4022       jcc(Assembler::zero, DONE);
4023 
4024       // Quick test using the already prepared vector mask
4025       movl(len, result);
4026       andl(len, 0x0000000f);   // tail count (in bytes)
4027       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4028       ptest(vec1, vec2);
4029       jcc(Assembler::zero, DONE);
4030       jmpb(TAIL_START);
4031 
4032       bind(BREAK_LOOP);
4033       // At least one byte in the last 16-byte vector is negative.
4034       // Set up and look at the last 16 bytes as if they were a tail
4035       lea(ary1, Address(ary1, len, Address::times_1));
4036       addptr(result, len);
4037       // Ignore the very last byte: if all others are positive,
4038       // it must be negative, so we can skip right to the 2+1 byte
4039       // end comparison at this point
4040       orl(result, 15);
4041       movl(len, 15);
4042       // Fallthru to tail compare
4043     }
4044   }
4045 
4046   bind(TAIL_START);
4047   // Compare 4-byte vectors
4048   andl(len, 0xfffffffc); // vector count (in bytes)
4049   jccb(Assembler::zero, COMPARE_CHAR);
4050 
4051   lea(ary1, Address(ary1, len, Address::times_1));
4052   negptr(len);
4053 
4054   bind(COMPARE_VECTORS);
4055   movl(tmp1, Address(ary1, len, Address::times_1));
4056   andl(tmp1, 0x80808080);
4057   jccb(Assembler::notZero, TAIL_ADJUST);
4058   addptr(len, 4);
4059   jccb(Assembler::notZero, COMPARE_VECTORS);
4060 
4061   // Compare trailing char (final 2-3 bytes), if any
4062   bind(COMPARE_CHAR);
4063 
4064   testl(result, 0x2);   // tail  char
4065   jccb(Assembler::zero, COMPARE_BYTE);
4066   load_unsigned_short(tmp1, Address(ary1, 0));
4067   andl(tmp1, 0x00008080);
4068   jccb(Assembler::notZero, CHAR_ADJUST);
4069   lea(ary1, Address(ary1, 2));
4070 
4071   bind(COMPARE_BYTE);
4072   testl(result, 0x1);   // tail  byte
4073   jccb(Assembler::zero, DONE);
4074   load_unsigned_byte(tmp1, Address(ary1, 0));
4075   testl(tmp1, 0x00000080);
4076   jccb(Assembler::zero, DONE);
4077   subptr(result, 1);
4078   jmpb(DONE);
4079 
4080   bind(TAIL_ADJUST);
4081   // there are negative bits in the last 4 byte block.
4082   // Adjust result and check the next three bytes
4083   addptr(result, len);
4084   orl(result, 3);
4085   lea(ary1, Address(ary1, len, Address::times_1));
4086   jmpb(COMPARE_CHAR);
4087 
4088   bind(CHAR_ADJUST);
4089   // We are looking at a char + optional byte tail, and found that one
4090   // of the bytes in the char is negative. Adjust the result, check the
4091   // first byte and readjust if needed.
4092   andl(result, 0xfffffffc);
4093   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4094   jccb(Assembler::notZero, DONE);
4095   addptr(result, 1);
4096 
4097   // That's it
4098   bind(DONE);
4099   if (UseAVX >= 2 && UseSSE >= 2) {
4100     // clean upper bits of YMM registers
4101     vpxor(vec1, vec1);
4102     vpxor(vec2, vec2);
4103   }
4104 }
4105 
4106 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4107 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4108                                       Register limit, Register result, Register chr,
4109                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4110   ShortBranchVerifier sbv(this);
4111   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4112 
4113   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4114   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4115 
4116   if (is_array_equ) {
4117     // Check the input args
4118     cmpoop(ary1, ary2);
4119     jcc(Assembler::equal, TRUE_LABEL);
4120 
4121     // Need additional checks for arrays_equals.
4122     testptr(ary1, ary1);
4123     jcc(Assembler::zero, FALSE_LABEL);
4124     testptr(ary2, ary2);
4125     jcc(Assembler::zero, FALSE_LABEL);
4126 
4127     // Check the lengths
4128     movl(limit, Address(ary1, length_offset));
4129     cmpl(limit, Address(ary2, length_offset));
4130     jcc(Assembler::notEqual, FALSE_LABEL);
4131   }
4132 
4133   // count == 0
4134   testl(limit, limit);
4135   jcc(Assembler::zero, TRUE_LABEL);
4136 
4137   if (is_array_equ) {
4138     // Load array address
4139     lea(ary1, Address(ary1, base_offset));
4140     lea(ary2, Address(ary2, base_offset));
4141   }
4142 
4143   if (is_array_equ && is_char) {
4144     // arrays_equals when used for char[].
4145     shll(limit, 1);      // byte count != 0
4146   }
4147   movl(result, limit); // copy
4148 
4149   if (UseAVX >= 2) {
4150     // With AVX2, use 32-byte vector compare
4151     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4152 
4153     // Compare 32-byte vectors
4154     andl(result, 0x0000001f);  //   tail count (in bytes)
4155     andl(limit, 0xffffffe0);   // vector count (in bytes)
4156     jcc(Assembler::zero, COMPARE_TAIL);
4157 
4158     lea(ary1, Address(ary1, limit, Address::times_1));
4159     lea(ary2, Address(ary2, limit, Address::times_1));
4160     negptr(limit);
4161 
4162 #ifdef _LP64
4163     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4164       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4165 
4166       cmpl(limit, -64);
4167       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4168 
4169       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4170 
4171       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4172       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4173       kortestql(mask, mask);
4174       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4175       addptr(limit, 64);  // update since we already compared at this addr
4176       cmpl(limit, -64);
4177       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4178 
4179       // At this point we may still need to compare -limit+result bytes.
4180       // We could execute the next two instructions and just continue via the non-wide path:
4181       //  cmpl(limit, 0);
4182       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4183       // But since we stopped at the points ary{1,2}+limit which are
4184       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4185       // (|limit| <= 32 and result < 32),
4186       // we may just compare the last 64 bytes.
4187       //
4188       addptr(result, -64);   // it is safe, bc we just came from this area
4189       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4190       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4191       kortestql(mask, mask);
4192       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4193 
4194       jmp(TRUE_LABEL);
4195 
4196       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4197 
4198     }//if (VM_Version::supports_avx512vlbw())
4199 #endif //_LP64
4200     bind(COMPARE_WIDE_VECTORS);
4201     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4202     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4203     vpxor(vec1, vec2);
4204 
4205     vptest(vec1, vec1);
4206     jcc(Assembler::notZero, FALSE_LABEL);
4207     addptr(limit, 32);
4208     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4209 
4210     testl(result, result);
4211     jcc(Assembler::zero, TRUE_LABEL);
4212 
4213     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4214     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4215     vpxor(vec1, vec2);
4216 
4217     vptest(vec1, vec1);
4218     jccb(Assembler::notZero, FALSE_LABEL);
4219     jmpb(TRUE_LABEL);
4220 
4221     bind(COMPARE_TAIL); // limit is zero
4222     movl(limit, result);
4223     // Fallthru to tail compare
4224   } else if (UseSSE42Intrinsics) {
4225     // With SSE4.2, use double quad vector compare
4226     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4227 
4228     // Compare 16-byte vectors
4229     andl(result, 0x0000000f);  //   tail count (in bytes)
4230     andl(limit, 0xfffffff0);   // vector count (in bytes)
4231     jcc(Assembler::zero, COMPARE_TAIL);
4232 
4233     lea(ary1, Address(ary1, limit, Address::times_1));
4234     lea(ary2, Address(ary2, limit, Address::times_1));
4235     negptr(limit);
4236 
4237     bind(COMPARE_WIDE_VECTORS);
4238     movdqu(vec1, Address(ary1, limit, Address::times_1));
4239     movdqu(vec2, Address(ary2, limit, Address::times_1));
4240     pxor(vec1, vec2);
4241 
4242     ptest(vec1, vec1);
4243     jcc(Assembler::notZero, FALSE_LABEL);
4244     addptr(limit, 16);
4245     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4246 
4247     testl(result, result);
4248     jcc(Assembler::zero, TRUE_LABEL);
4249 
4250     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4251     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4252     pxor(vec1, vec2);
4253 
4254     ptest(vec1, vec1);
4255     jccb(Assembler::notZero, FALSE_LABEL);
4256     jmpb(TRUE_LABEL);
4257 
4258     bind(COMPARE_TAIL); // limit is zero
4259     movl(limit, result);
4260     // Fallthru to tail compare
4261   }
4262 
4263   // Compare 4-byte vectors
4264   andl(limit, 0xfffffffc); // vector count (in bytes)
4265   jccb(Assembler::zero, COMPARE_CHAR);
4266 
4267   lea(ary1, Address(ary1, limit, Address::times_1));
4268   lea(ary2, Address(ary2, limit, Address::times_1));
4269   negptr(limit);
4270 
4271   bind(COMPARE_VECTORS);
4272   movl(chr, Address(ary1, limit, Address::times_1));
4273   cmpl(chr, Address(ary2, limit, Address::times_1));
4274   jccb(Assembler::notEqual, FALSE_LABEL);
4275   addptr(limit, 4);
4276   jcc(Assembler::notZero, COMPARE_VECTORS);
4277 
4278   // Compare trailing char (final 2 bytes), if any
4279   bind(COMPARE_CHAR);
4280   testl(result, 0x2);   // tail  char
4281   jccb(Assembler::zero, COMPARE_BYTE);
4282   load_unsigned_short(chr, Address(ary1, 0));
4283   load_unsigned_short(limit, Address(ary2, 0));
4284   cmpl(chr, limit);
4285   jccb(Assembler::notEqual, FALSE_LABEL);
4286 
4287   if (is_array_equ && is_char) {
4288     bind(COMPARE_BYTE);
4289   } else {
4290     lea(ary1, Address(ary1, 2));
4291     lea(ary2, Address(ary2, 2));
4292 
4293     bind(COMPARE_BYTE);
4294     testl(result, 0x1);   // tail  byte
4295     jccb(Assembler::zero, TRUE_LABEL);
4296     load_unsigned_byte(chr, Address(ary1, 0));
4297     load_unsigned_byte(limit, Address(ary2, 0));
4298     cmpl(chr, limit);
4299     jccb(Assembler::notEqual, FALSE_LABEL);
4300   }
4301   bind(TRUE_LABEL);
4302   movl(result, 1);   // return true
4303   jmpb(DONE);
4304 
4305   bind(FALSE_LABEL);
4306   xorl(result, result); // return false
4307 
4308   // That's it
4309   bind(DONE);
4310   if (UseAVX >= 2) {
4311     // clean upper bits of YMM registers
4312     vpxor(vec1, vec1);
4313     vpxor(vec2, vec2);
4314   }
4315 }
4316 
4317 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4318                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4319   switch(ideal_opc) {
4320     case Op_LShiftVS:
4321       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4322     case Op_LShiftVI:
4323       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4324     case Op_LShiftVL:
4325       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4326     case Op_RShiftVS:
4327       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4328     case Op_RShiftVI:
4329       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4330     case Op_RShiftVL:
4331       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4332     case Op_URShiftVS:
4333       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4334     case Op_URShiftVI:
4335       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4336     case Op_URShiftVL:
4337       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4338     case Op_RotateRightV:
4339       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4340     case Op_RotateLeftV:
4341       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4342     default:
4343       fatal("Unsupported masked operation"); break;
4344   }
4345 }
4346 
4347 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4348                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4349                                     bool is_varshift) {
4350   switch (ideal_opc) {
4351     case Op_AddVB:
4352       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4353     case Op_AddVS:
4354       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4355     case Op_AddVI:
4356       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4357     case Op_AddVL:
4358       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4359     case Op_AddVF:
4360       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4361     case Op_AddVD:
4362       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4363     case Op_SubVB:
4364       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4365     case Op_SubVS:
4366       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4367     case Op_SubVI:
4368       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4369     case Op_SubVL:
4370       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4371     case Op_SubVF:
4372       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4373     case Op_SubVD:
4374       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4375     case Op_MulVS:
4376       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4377     case Op_MulVI:
4378       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4379     case Op_MulVL:
4380       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4381     case Op_MulVF:
4382       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4383     case Op_MulVD:
4384       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4385     case Op_DivVF:
4386       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4387     case Op_DivVD:
4388       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4389     case Op_SqrtVF:
4390       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4391     case Op_SqrtVD:
4392       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4393     case Op_AbsVB:
4394       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4395     case Op_AbsVS:
4396       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4397     case Op_AbsVI:
4398       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4399     case Op_AbsVL:
4400       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4401     case Op_FmaVF:
4402       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4403     case Op_FmaVD:
4404       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4405     case Op_VectorRearrange:
4406       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4407     case Op_LShiftVS:
4408       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4409     case Op_LShiftVI:
4410       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4411     case Op_LShiftVL:
4412       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4413     case Op_RShiftVS:
4414       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4415     case Op_RShiftVI:
4416       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4417     case Op_RShiftVL:
4418       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4419     case Op_URShiftVS:
4420       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4421     case Op_URShiftVI:
4422       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4423     case Op_URShiftVL:
4424       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4425     case Op_RotateLeftV:
4426       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4427     case Op_RotateRightV:
4428       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4429     case Op_MaxV:
4430       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4431     case Op_MinV:
4432       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4433     case Op_XorV:
4434       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4435     case Op_OrV:
4436       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4437     case Op_AndV:
4438       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4439     default:
4440       fatal("Unsupported masked operation"); break;
4441   }
4442 }
4443 
4444 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4445                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4446   switch (ideal_opc) {
4447     case Op_AddVB:
4448       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4449     case Op_AddVS:
4450       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4451     case Op_AddVI:
4452       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4453     case Op_AddVL:
4454       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4455     case Op_AddVF:
4456       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4457     case Op_AddVD:
4458       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4459     case Op_SubVB:
4460       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4461     case Op_SubVS:
4462       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4463     case Op_SubVI:
4464       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4465     case Op_SubVL:
4466       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4467     case Op_SubVF:
4468       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4469     case Op_SubVD:
4470       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4471     case Op_MulVS:
4472       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4473     case Op_MulVI:
4474       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4475     case Op_MulVL:
4476       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4477     case Op_MulVF:
4478       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4479     case Op_MulVD:
4480       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4481     case Op_DivVF:
4482       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4483     case Op_DivVD:
4484       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4485     case Op_FmaVF:
4486       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4487     case Op_FmaVD:
4488       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4489     case Op_MaxV:
4490       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4491     case Op_MinV:
4492       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4493     case Op_XorV:
4494       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4495     case Op_OrV:
4496       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4497     case Op_AndV:
4498       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4499     default:
4500       fatal("Unsupported masked operation"); break;
4501   }
4502 }
4503 
4504 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4505                                   KRegister src1, KRegister src2) {
4506   BasicType etype = T_ILLEGAL;
4507   switch(mask_len) {
4508     case 2:
4509     case 4:
4510     case 8:  etype = T_BYTE; break;
4511     case 16: etype = T_SHORT; break;
4512     case 32: etype = T_INT; break;
4513     case 64: etype = T_LONG; break;
4514     default: fatal("Unsupported type"); break;
4515   }
4516   assert(etype != T_ILLEGAL, "");
4517   switch(ideal_opc) {
4518     case Op_AndVMask:
4519       kand(etype, dst, src1, src2); break;
4520     case Op_OrVMask:
4521       kor(etype, dst, src1, src2); break;
4522     case Op_XorVMask:
4523       kxor(etype, dst, src1, src2); break;
4524     default:
4525       fatal("Unsupported masked operation"); break;
4526   }
4527 }
4528 
4529 /*
4530  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4531  * If src is NaN, the result is 0.
4532  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4533  * the result is equal to the value of Integer.MIN_VALUE.
4534  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4535  * the result is equal to the value of Integer.MAX_VALUE.
4536  */
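// A scalar sketch of the semantics described above; illustrative only, not part of the
// generated code. The helper name is hypothetical; jint/jfloat and min_jint/max_jint are
// HotSpot's existing typedefs/constants.
//
//   static jint f2i_special_cases(jfloat f) {
//     if (f != f)                 return 0;         // NaN
//     if (f <= (jfloat) min_jint) return min_jint;  // -Inf or below the int range
//     if (f >= (jfloat) max_jint) return max_jint;  // +Inf or above the int range
//     return (jint) f;
//   }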
4537 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4538                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4539                                                                    Register rscratch, AddressLiteral float_sign_flip,
4540                                                                    int vec_enc) {
4541   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4542   Label done;
4543   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4544   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4545   vptest(xtmp2, xtmp2, vec_enc);
4546   jccb(Assembler::equal, done);
4547 
4548   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4549   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4550 
4551   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4552   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4553   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4554 
4555   // Recompute the mask for the remaining special values.
4556   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4557   // Extract SRC values corresponding to TRUE mask lanes.
4558   vpand(xtmp4, xtmp2, src, vec_enc);
4559   // Flip the mask bits so that the MSB of mask lanes corresponding to +ve special
4560   // values is set.
4561   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4562 
4563   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4564   bind(done);
4565 }
4566 
4567 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4568                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4569                                                                     Register rscratch, AddressLiteral float_sign_flip,
4570                                                                     int vec_enc) {
4571   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4572   Label done;
4573   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4574   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4575   kortestwl(ktmp1, ktmp1);
4576   jccb(Assembler::equal, done);
4577 
4578   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4579   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4580   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4581 
4582   kxorwl(ktmp1, ktmp1, ktmp2);
4583   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4584   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4585   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4586   bind(done);
4587 }
4588 
4589 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4590                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4591                                                                      Register rscratch, AddressLiteral double_sign_flip,
4592                                                                      int vec_enc) {
4593   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4594 
4595   Label done;
4596   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4597   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4598   kortestwl(ktmp1, ktmp1);
4599   jccb(Assembler::equal, done);
4600 
4601   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4602   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4603   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4604 
4605   kxorwl(ktmp1, ktmp1, ktmp2);
4606   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4607   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4608   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4609   bind(done);
4610 }
4611 
4612 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4613                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4614                                                                      Register rscratch, AddressLiteral float_sign_flip,
4615                                                                      int vec_enc) {
4616   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4617   Label done;
4618   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4619   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4620   kortestwl(ktmp1, ktmp1);
4621   jccb(Assembler::equal, done);
4622 
4623   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4624   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4625   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4626 
4627   kxorwl(ktmp1, ktmp1, ktmp2);
4628   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4629   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4630   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4631   bind(done);
4632 }
4633 
4634 /*
4635  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4636  * If src is NaN, the result is 0.
4637  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4638  * the result is equal to the value of Long.MIN_VALUE.
4639  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4640  * the result is equal to the value of Long.MAX_VALUE.
4641  */
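// Analogous scalar sketch for the double -> long case; illustrative only, not part of the
// generated code. The helper name is hypothetical; jlong/jdouble and min_jlong/max_jlong
// are HotSpot's existing typedefs/constants.
//
//   static jlong d2l_special_cases(jdouble d) {
//     if (d != d)                   return 0;          // NaN
//     if (d <= (jdouble) min_jlong) return min_jlong;  // -Inf or below the long range
//     if (d >= (jdouble) max_jlong) return max_jlong;  // +Inf or above the long range
//     return (jlong) d;
//   }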
4642 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4643                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4644                                                                       Register rscratch, AddressLiteral double_sign_flip,
4645                                                                       int vec_enc) {
4646   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4647 
4648   Label done;
4649   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4650   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4651   kortestwl(ktmp1, ktmp1);
4652   jccb(Assembler::equal, done);
4653 
4654   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4655   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4656   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4657 
4658   kxorwl(ktmp1, ktmp1, ktmp2);
4659   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4660   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4661   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4662   bind(done);
4663 }
4664 
4665 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4666                                                              XMMRegister xtmp, int index, int vec_enc) {
4667    assert(vec_enc < Assembler::AVX_512bit, "");
4668    if (vec_enc == Assembler::AVX_256bit) {
4669      vextractf128_high(xtmp, src);
4670      vshufps(dst, src, xtmp, index, vec_enc);
4671    } else {
4672      vshufps(dst, src, zero, index, vec_enc);
4673    }
4674 }
4675 
4676 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4677                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4678                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4679   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4680 
4681   Label done;
4682   // Compare the destination lanes with float_sign_flip
4683   // value to get mask for all special values.
4684   movdqu(xtmp1, float_sign_flip, rscratch);
4685   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4686   ptest(xtmp2, xtmp2);
4687   jccb(Assembler::equal, done);
4688 
4689   // Flip float_sign_flip to get max integer value.
4690   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4691   pxor(xtmp1, xtmp4);
4692 
4693   // Set destination lanes corresponding to unordered source lanes to zero.
4694   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4695   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4696 
4697   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4698   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4699   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4700 
4701   // Recompute the mask for the remaining special values.
4702   pxor(xtmp2, xtmp3);
4703   // Extract mask corresponding to non-negative source lanes.
4704   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4705 
4706   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4707   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4708   pand(xtmp3, xtmp2);
4709 
4710   // Replace destination lanes holding the special value (0x80000000) with max int
4711   // if the corresponding source lane holds a +ve value.
4712   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4713   bind(done);
4714 }
4715 
4716 
4717 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4718                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4719   switch(to_elem_bt) {
4720     case T_SHORT:
4721       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4722       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4723       vpackusdw(dst, dst, zero, vec_enc);
4724       if (vec_enc == Assembler::AVX_256bit) {
4725         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4726       }
4727       break;
4728     case  T_BYTE:
4729       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4730       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4731       vpackusdw(dst, dst, zero, vec_enc);
4732       if (vec_enc == Assembler::AVX_256bit) {
4733         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4734       }
4735       vpackuswb(dst, dst, zero, vec_enc);
4736       break;
4737     default: assert(false, "%s", type2name(to_elem_bt));
4738   }
4739 }
4740 
4741 /*
4742  * Algorithm for vector D2L and F2I conversions:-
4743  * a) Perform vector D2L/F2I cast.
4744  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
4745  *    A lane holding 0x80000000 signifies that the source value could be any of the special
4746  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4747  * c) Set the destination lane to zero if the source is a NaN value.
4748  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4749  */
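// Per-lane scalar sketch of steps b)-d) above, shown for the F2I case (the D2L case is
// analogous with jdouble/jlong). Illustrative only, not part of the generated code; the
// helper name is hypothetical. `raw` stands for the value produced by the truncating
// vector cast, which yields 0x80000000 (min_jint) for every special input.
//
//   static jint fixup_f2i_lane(jfloat src, jint raw) {
//     if (raw != min_jint) return raw;      // b) fast path: not a special value
//     if (src != src)      return 0;        // c) NaN -> 0
//     if (src > 0.0f)      return max_jint; // d) +ve special value -> MaxInt
//     return min_jint;                      //    -Inf / below range keeps MinInt
//   }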
4750 
4751 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4752                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4753                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4754   int to_elem_sz = type2aelembytes(to_elem_bt);
4755   assert(to_elem_sz <= 4, "");
4756   vcvttps2dq(dst, src, vec_enc);
4757   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4758   if (to_elem_sz < 4) {
4759     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4760     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4761   }
4762 }
4763 
4764 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4765                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4766                                             Register rscratch, int vec_enc) {
4767   int to_elem_sz = type2aelembytes(to_elem_bt);
4768   assert(to_elem_sz <= 4, "");
4769   vcvttps2dq(dst, src, vec_enc);
4770   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4771   switch(to_elem_bt) {
4772     case T_INT:
4773       break;
4774     case T_SHORT:
4775       evpmovdw(dst, dst, vec_enc);
4776       break;
4777     case T_BYTE:
4778       evpmovdb(dst, dst, vec_enc);
4779       break;
4780     default: assert(false, "%s", type2name(to_elem_bt));
4781   }
4782 }
4783 
4784 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4785                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4786                                             Register rscratch, int vec_enc) {
4787   evcvttps2qq(dst, src, vec_enc);
4788   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4789 }
4790 
4791 // Handling for downcasting from double to integer or sub-word types on AVX2.
4792 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4793                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4794                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4795   int to_elem_sz = type2aelembytes(to_elem_bt);
4796   assert(to_elem_sz < 8, "");
4797   vcvttpd2dq(dst, src, vec_enc);
4798   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4799                                               float_sign_flip, vec_enc);
4800   if (to_elem_sz < 4) {
4801     // xtmp4 holds all zero lanes.
4802     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4803   }
4804 }
4805 
4806 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4807                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4808                                             KRegister ktmp2, AddressLiteral sign_flip,
4809                                             Register rscratch, int vec_enc) {
4810   if (VM_Version::supports_avx512dq()) {
4811     evcvttpd2qq(dst, src, vec_enc);
4812     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4813     switch(to_elem_bt) {
4814       case T_LONG:
4815         break;
4816       case T_INT:
4817         evpmovsqd(dst, dst, vec_enc);
4818         break;
4819       case T_SHORT:
4820         evpmovsqd(dst, dst, vec_enc);
4821         evpmovdw(dst, dst, vec_enc);
4822         break;
4823       case T_BYTE:
4824         evpmovsqd(dst, dst, vec_enc);
4825         evpmovdb(dst, dst, vec_enc);
4826         break;
4827       default: assert(false, "%s", type2name(to_elem_bt));
4828     }
4829   } else {
4830     assert(type2aelembytes(to_elem_bt) <= 4, "");
4831     vcvttpd2dq(dst, src, vec_enc);
4832     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4833     switch(to_elem_bt) {
4834       case T_INT:
4835         break;
4836       case T_SHORT:
4837         evpmovdw(dst, dst, vec_enc);
4838         break;
4839       case T_BYTE:
4840         evpmovdb(dst, dst, vec_enc);
4841         break;
4842       default: assert(false, "%s", type2name(to_elem_bt));
4843     }
4844   }
4845 }
4846 
4847 #ifdef _LP64
4848 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4849                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4850                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4851   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
4852   // and restore the original MXCSR.RC mode afterwards.
4853   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4854 
4855   mov64(tmp, julong_cast(0.5L));
4856   evpbroadcastq(xtmp1, tmp, vec_enc);
4857   vaddpd(xtmp1, src , xtmp1, vec_enc);
4858   evcvtpd2qq(dst, xtmp1, vec_enc);
4859   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4860                                                 double_sign_flip, vec_enc);
4861 
4862   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4863 }
4864 
4865 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4866                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4867                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4868   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
4869   // and restore the original MXCSR.RC mode afterwards.
4870   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4871 
4872   movl(tmp, jint_cast(0.5));
4873   movq(xtmp1, tmp);
4874   vbroadcastss(xtmp1, xtmp1, vec_enc);
4875   vaddps(xtmp1, src , xtmp1, vec_enc);
4876   vcvtps2dq(dst, xtmp1, vec_enc);
4877   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4878                                               float_sign_flip, vec_enc);
4879 
4880   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4881 }
4882 
4883 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4884                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4885                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4886   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
4887   // and restore the original MXCSR.RC mode afterwards.
4888   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4889 
4890   movl(tmp, jint_cast(0.5));
4891   movq(xtmp1, tmp);
4892   vbroadcastss(xtmp1, xtmp1, vec_enc);
4893   vaddps(xtmp1, src , xtmp1, vec_enc);
4894   vcvtps2dq(dst, xtmp1, vec_enc);
4895   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4896 
4897   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4898 }
4899 #endif // _LP64
4900 
4901 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4902                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4903   switch (from_elem_bt) {
4904     case T_BYTE:
4905       switch (to_elem_bt) {
4906         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4907         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4908         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4909         default: ShouldNotReachHere();
4910       }
4911       break;
4912     case T_SHORT:
4913       switch (to_elem_bt) {
4914         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4915         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4916         default: ShouldNotReachHere();
4917       }
4918       break;
4919     case T_INT:
4920       assert(to_elem_bt == T_LONG, "");
4921       vpmovzxdq(dst, src, vlen_enc);
4922       break;
4923     default:
4924       ShouldNotReachHere();
4925   }
4926 }
4927 
4928 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4929                                            BasicType from_elem_bt, BasicType to_elem_bt) {
4930   switch (from_elem_bt) {
4931     case T_BYTE:
4932       switch (to_elem_bt) {
4933         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
4934         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
4935         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
4936         default: ShouldNotReachHere();
4937       }
4938       break;
4939     case T_SHORT:
4940       switch (to_elem_bt) {
4941         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
4942         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
4943         default: ShouldNotReachHere();
4944       }
4945       break;
4946     case T_INT:
4947       assert(to_elem_bt == T_LONG, "");
4948       vpmovsxdq(dst, src, vlen_enc);
4949       break;
4950     default:
4951       ShouldNotReachHere();
4952   }
4953 }
4954 
4955 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
4956                                          BasicType dst_bt, BasicType src_bt, int vlen) {
4957   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
4958   assert(vlen_enc != AVX_512bit, "");
4959 
4960   int dst_bt_size = type2aelembytes(dst_bt);
4961   int src_bt_size = type2aelembytes(src_bt);
4962   if (dst_bt_size > src_bt_size) {
4963     switch (dst_bt_size / src_bt_size) {
4964       case 2: vpmovsxbw(dst, src, vlen_enc); break;
4965       case 4: vpmovsxbd(dst, src, vlen_enc); break;
4966       case 8: vpmovsxbq(dst, src, vlen_enc); break;
4967       default: ShouldNotReachHere();
4968     }
4969   } else {
4970     assert(dst_bt_size < src_bt_size, "");
4971     switch (src_bt_size / dst_bt_size) {
4972       case 2: {
4973         if (vlen_enc == AVX_128bit) {
4974           vpacksswb(dst, src, src, vlen_enc);
4975         } else {
4976           vpacksswb(dst, src, src, vlen_enc);
4977           vpermq(dst, dst, 0x08, vlen_enc);
4978         }
4979         break;
4980       }
4981       case 4: {
4982         if (vlen_enc == AVX_128bit) {
4983           vpackssdw(dst, src, src, vlen_enc);
4984           vpacksswb(dst, dst, dst, vlen_enc);
4985         } else {
4986           vpackssdw(dst, src, src, vlen_enc);
4987           vpermq(dst, dst, 0x08, vlen_enc);
4988           vpacksswb(dst, dst, dst, AVX_128bit);
4989         }
4990         break;
4991       }
4992       case 8: {
4993         if (vlen_enc == AVX_128bit) {
4994           vpshufd(dst, src, 0x08, vlen_enc);
4995           vpackssdw(dst, dst, dst, vlen_enc);
4996           vpacksswb(dst, dst, dst, vlen_enc);
4997         } else {
4998           vpshufd(dst, src, 0x08, vlen_enc);
4999           vpermq(dst, dst, 0x08, vlen_enc);
5000           vpackssdw(dst, dst, dst, AVX_128bit);
5001           vpacksswb(dst, dst, dst, AVX_128bit);
5002         }
5003         break;
5004       }
5005       default: ShouldNotReachHere();
5006     }
5007   }
5008 }
5009 
5010 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5011                                    bool merge, BasicType bt, int vlen_enc) {
5012   if (bt == T_INT) {
5013     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5014   } else {
5015     assert(bt == T_LONG, "");
5016     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5017   }
5018 }
5019 
5020 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5021                                    bool merge, BasicType bt, int vlen_enc) {
5022   if (bt == T_INT) {
5023     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5024   } else {
5025     assert(bt == T_LONG, "");
5026     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5027   }
5028 }
5029 
5030 #ifdef _LP64
5031 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5032                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5033                                                int vec_enc) {
5034   int index = 0;
5035   int vindex = 0;
5036   mov64(rtmp1, 0x0101010101010101L);
5037   pdepq(rtmp1, src, rtmp1);
5038   if (mask_len > 8) {
5039     movq(rtmp2, src);
5040     vpxor(xtmp, xtmp, xtmp, vec_enc);
5041     movq(xtmp, rtmp1);
5042   }
5043   movq(dst, rtmp1);
5044 
5045   mask_len -= 8;
5046   while (mask_len > 0) {
5047     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5048     index++;
5049     if ((index % 2) == 0) {
5050       pxor(xtmp, xtmp);
5051     }
5052     mov64(rtmp1, 0x0101010101010101L);
5053     shrq(rtmp2, 8);
5054     pdepq(rtmp1, rtmp2, rtmp1);
5055     pinsrq(xtmp, rtmp1, index % 2);
5056     vindex = index / 2;
5057     if (vindex) {
5058       // Write the entire 16 byte vector once both 64 bit
5059       // lanes are updated, to save redundant instructions.
5060       if (index % 2) {
5061         vinsertf128(dst, dst, xtmp, vindex);
5062       }
5063     } else {
5064       vmovdqu(dst, xtmp);
5065     }
5066     mask_len -= 8;
5067   }
5068 }
5069 
5070 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5071   switch(opc) {
5072     case Op_VectorMaskTrueCount:
5073       popcntq(dst, tmp);
5074       break;
5075     case Op_VectorMaskLastTrue:
5076       if (VM_Version::supports_lzcnt()) {
5077         lzcntq(tmp, tmp);
5078         movl(dst, 63);
5079         subl(dst, tmp);
5080       } else {
5081         movl(dst, -1);
5082         bsrq(tmp, tmp);
5083         cmov32(Assembler::notZero, dst, tmp);
5084       }
5085       break;
5086     case Op_VectorMaskFirstTrue:
5087       if (VM_Version::supports_bmi1()) {
5088         if (masklen < 32) {
5089           orl(tmp, 1 << masklen);
5090           tzcntl(dst, tmp);
5091         } else if (masklen == 32) {
5092           tzcntl(dst, tmp);
5093         } else {
5094           assert(masklen == 64, "");
5095           tzcntq(dst, tmp);
5096         }
5097       } else {
5098         if (masklen < 32) {
5099           orl(tmp, 1 << masklen);
5100           bsfl(dst, tmp);
5101         } else {
5102           assert(masklen == 32 || masklen == 64, "");
5103           movl(dst, masklen);
5104           if (masklen == 32)  {
5105             bsfl(tmp, tmp);
5106           } else {
5107             bsfq(tmp, tmp);
5108           }
5109           cmov32(Assembler::notZero, dst, tmp);
5110         }
5111       }
5112       break;
5113     case Op_VectorMaskToLong:
5114       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5115       break;
5116     default: assert(false, "Unhandled mask operation");
5117   }
5118 }
5119 
5120 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5121                                               int masklen, int masksize, int vec_enc) {
5122   assert(VM_Version::supports_popcnt(), "");
5123 
5124   if (VM_Version::supports_avx512bw()) {
5125     kmovql(tmp, mask);
5126   } else {
5127     assert(masklen <= 16, "");
5128     kmovwl(tmp, mask);
5129   }
5130 
5131   // A mask generated by partial vector comparison/replicate/mask manipulation
5132   // operations needs to be clipped.
5133   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5134     andq(tmp, (1 << masklen) - 1);
5135   }
5136 
5137   vector_mask_operation_helper(opc, dst, tmp, masklen);
5138 }
5139 
5140 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5141                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5142   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5143          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5144   assert(VM_Version::supports_popcnt(), "");
5145 
5146   bool need_clip = false;
5147   switch(bt) {
5148     case T_BOOLEAN:
5149       // While masks of other types contain lane values of 0 or -1, boolean masks contain lane values of 0 or 1
5150       vpxor(xtmp, xtmp, xtmp, vec_enc);
5151       vpsubb(xtmp, xtmp, mask, vec_enc);
5152       vpmovmskb(tmp, xtmp, vec_enc);
5153       need_clip = masklen < 16;
5154       break;
5155     case T_BYTE:
5156       vpmovmskb(tmp, mask, vec_enc);
5157       need_clip = masklen < 16;
5158       break;
5159     case T_SHORT:
5160       vpacksswb(xtmp, mask, mask, vec_enc);
5161       if (masklen >= 16) {
5162         vpermpd(xtmp, xtmp, 8, vec_enc);
5163       }
5164       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5165       need_clip = masklen < 16;
5166       break;
5167     case T_INT:
5168     case T_FLOAT:
5169       vmovmskps(tmp, mask, vec_enc);
5170       need_clip = masklen < 4;
5171       break;
5172     case T_LONG:
5173     case T_DOUBLE:
5174       vmovmskpd(tmp, mask, vec_enc);
5175       need_clip = masklen < 2;
5176       break;
5177     default: assert(false, "Unhandled type, %s", type2name(bt));
5178   }
5179 
5180   // A mask generated by partial vector comparison/replicate/mask manipulation
5181   // operations needs to be clipped.
5182   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5183     // need_clip implies masklen < 32
5184     andq(tmp, (1 << masklen) - 1);
5185   }
5186 
5187   vector_mask_operation_helper(opc, dst, tmp, masklen);
5188 }
5189 
5190 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5191                                              Register rtmp2, int mask_len) {
5192   kmov(rtmp1, src);
5193   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5194   mov64(rtmp2, -1L);
5195   pextq(rtmp2, rtmp2, rtmp1);
5196   kmov(dst, rtmp2);
5197 }
5198 
5199 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5200                                                bool merge, BasicType bt, int vec_enc) {
5201   if (opcode == Op_CompressV) {
5202     switch(bt) {
5203     case T_BYTE:
5204       evpcompressb(dst, mask, src, merge, vec_enc);
5205       break;
5206     case T_CHAR:
5207     case T_SHORT:
5208       evpcompressw(dst, mask, src, merge, vec_enc);
5209       break;
5210     case T_INT:
5211       evpcompressd(dst, mask, src, merge, vec_enc);
5212       break;
5213     case T_FLOAT:
5214       evcompressps(dst, mask, src, merge, vec_enc);
5215       break;
5216     case T_LONG:
5217       evpcompressq(dst, mask, src, merge, vec_enc);
5218       break;
5219     case T_DOUBLE:
5220       evcompresspd(dst, mask, src, merge, vec_enc);
5221       break;
5222     default:
5223       fatal("Unsupported type %s", type2name(bt));
5224       break;
5225     }
5226   } else {
5227     assert(opcode == Op_ExpandV, "");
5228     switch(bt) {
5229     case T_BYTE:
5230       evpexpandb(dst, mask, src, merge, vec_enc);
5231       break;
5232     case T_CHAR:
5233     case T_SHORT:
5234       evpexpandw(dst, mask, src, merge, vec_enc);
5235       break;
5236     case T_INT:
5237       evpexpandd(dst, mask, src, merge, vec_enc);
5238       break;
5239     case T_FLOAT:
5240       evexpandps(dst, mask, src, merge, vec_enc);
5241       break;
5242     case T_LONG:
5243       evpexpandq(dst, mask, src, merge, vec_enc);
5244       break;
5245     case T_DOUBLE:
5246       evexpandpd(dst, mask, src, merge, vec_enc);
5247       break;
5248     default:
5249       fatal("Unsupported type %s", type2name(bt));
5250       break;
5251     }
5252   }
5253 }
5254 #endif
5255 
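// The two signum expansions below implement Math.signum semantics; a scalar sketch for
// the double flavour (hypothetical helper, illustrative only, not part of the generated code):
//
//   static jdouble signum(jdouble d) {
//     if (d != d || d == 0.0) return d;     // NaN, -0.0 and +0.0 pass through
//     return (d < 0.0) ? -1.0 : 1.0;        // otherwise -1.0 for negative, 1.0 for positive
//   }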
5256 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5257                                            KRegister ktmp1, int vec_enc) {
5258   if (opcode == Op_SignumVD) {
5259     vsubpd(dst, zero, one, vec_enc);
5260     // if src < 0 ? -1 : 1
5261     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5262     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5263     // if src == NaN, -0.0 or 0.0 return src.
5264     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5265     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5266   } else {
5267     assert(opcode == Op_SignumVF, "");
5268     vsubps(dst, zero, one, vec_enc);
5269     // if src < 0 ? -1 : 1
5270     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5271     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5272     // if src == NaN, -0.0 or 0.0 return src.
5273     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5274     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5275   }
5276 }
5277 
5278 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5279                                           XMMRegister xtmp1, int vec_enc) {
5280   if (opcode == Op_SignumVD) {
5281     vsubpd(dst, zero, one, vec_enc);
5282     // if src < 0 ? -1 : 1
5283     vblendvpd(dst, one, dst, src, vec_enc);
5284     // if src == NaN, -0.0 or 0.0 return src.
5285     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5286     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5287   } else {
5288     assert(opcode == Op_SignumVF, "");
5289     vsubps(dst, zero, one, vec_enc);
5290     // if src < 0 ? -1 : 1
5291     vblendvps(dst, one, dst, src, vec_enc);
5292     // if src == NaN, -0.0 or 0.0 return src.
5293     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5294     vblendvps(dst, dst, src, xtmp1, vec_enc);
5295   }
5296 }
5297 
5298 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5299   if (VM_Version::supports_avx512bw()) {
5300     if (mask_len > 32) {
5301       kmovql(dst, src);
5302     } else {
5303       kmovdl(dst, src);
5304       if (mask_len != 32) {
5305         kshiftrdl(dst, dst, 32 - mask_len);
5306       }
5307     }
5308   } else {
5309     assert(mask_len <= 16, "");
5310     kmovwl(dst, src);
5311     if (mask_len != 16) {
5312       kshiftrwl(dst, dst, 16 - mask_len);
5313     }
5314   }
5315 }
5316 
5317 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5318   int lane_size = type2aelembytes(bt);
5319   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5320   if ((is_LP64 || lane_size < 8) &&
5321       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5322        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5323     movptr(rtmp, imm32);
5324     switch(lane_size) {
5325       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5326       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5327       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5328       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5329       default: fatal("Unsupported lane size %d", lane_size);
5330                break;
5331     }
5332   } else {
5333     movptr(rtmp, imm32);
5334     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5335     switch(lane_size) {
5336       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5337       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5338       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5339       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5340       default: fatal("Unsupported lane size %d", lane_size);
5341                break;
5342     }
5343   }
5344 }
5345 
5346 //
5347 // Following is lookup table based popcount computation algorithm:-
5348 //       Index   Bit set count
5349 //     [ 0000 ->   0,
5350 //       0001 ->   1,
5351 //       0010 ->   1,
5352 //       0011 ->   2,
5353 //       0100 ->   1,
5354 //       0101 ->   2,
5355 //       0110 ->   2,
5356 //       0111 ->   3,
5357 //       1000 ->   1,
5358 //       1001 ->   2,
5359 //       1010 ->   2,
5360 //       1011 ->   3,
5361 //       1100 ->   2,
5362 //       1101 ->   3,
//       1110 ->   3,
5363 //       1111 ->   4 ]
5364 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5365 //     shuffle indices for lookup table access.
5366 //  b. Right shift each byte of vector lane by 4 positions.
5367 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5368 //     shuffle indices for lookup table access.
5369 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5370 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5371 //     count of all the bytes of a quadword.
5372 //  f. Perform step e. for upper 128bit vector lane.
5373 //  g. Pack the bitset count of quadwords back to double word.
5374 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
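//
// Scalar per-byte sketch of steps a. through d.; illustrative only, not part of the
// generated code (the vector code uses StubRoutines::x86::vector_popcount_lut() as the
// shuffle table instead). Names are hypothetical.
//
//   static const uint8_t popcount_lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   static uint8_t popcount_byte(uint8_t b) {
//     return popcount_lut[b & 0x0F] + popcount_lut[b >> 4];  // low nibble + high nibble
//   }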
5375 
5376 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5377                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5378   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5379   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5380   vpsrlw(dst, src, 4, vec_enc);
5381   vpand(dst, dst, xtmp1, vec_enc);
5382   vpand(xtmp1, src, xtmp1, vec_enc);
5383   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5384   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5385   vpshufb(dst, xtmp2, dst, vec_enc);
5386   vpaddb(dst, dst, xtmp1, vec_enc);
5387 }
5388 
5389 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5390                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5391   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5392   // Following code is as per steps e,f,g and h of above algorithm.
5393   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5394   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5395   vpsadbw(dst, dst, xtmp2, vec_enc);
5396   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5397   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5398   vpackuswb(dst, xtmp1, dst, vec_enc);
5399 }
5400 
5401 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5402                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5403   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5404   // Add the popcount of upper and lower bytes of word.
5405   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5406   vpsrlw(dst, xtmp1, 8, vec_enc);
5407   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5408   vpaddw(dst, dst, xtmp1, vec_enc);
5409 }
5410 
5411 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5412                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5413   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5414   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5415   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5416 }
5417 
5418 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5419                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5420   switch(bt) {
5421     case T_LONG:
5422       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5423       break;
5424     case T_INT:
5425       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5426       break;
5427     case T_CHAR:
5428     case T_SHORT:
5429       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5430       break;
5431     case T_BYTE:
5432     case T_BOOLEAN:
5433       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5434       break;
5435     default:
5436       fatal("Unsupported type %s", type2name(bt));
5437       break;
5438   }
5439 }
5440 
5441 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5442                                                       KRegister mask, bool merge, int vec_enc) {
5443   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5444   switch(bt) {
5445     case T_LONG:
5446       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5447       evpopcntq(dst, mask, src, merge, vec_enc);
5448       break;
5449     case T_INT:
5450       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5451       evpopcntd(dst, mask, src, merge, vec_enc);
5452       break;
5453     case T_CHAR:
5454     case T_SHORT:
5455       assert(VM_Version::supports_avx512_bitalg(), "");
5456       evpopcntw(dst, mask, src, merge, vec_enc);
5457       break;
5458     case T_BYTE:
5459     case T_BOOLEAN:
5460       assert(VM_Version::supports_avx512_bitalg(), "");
5461       evpopcntb(dst, mask, src, merge, vec_enc);
5462       break;
5463     default:
5464       fatal("Unsupported type %s", type2name(bt));
5465       break;
5466   }
5467 }
5468 
5469 #ifndef _LP64
5470 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5471   assert(VM_Version::supports_avx512bw(), "");
5472   kmovdl(tmp, src);
5473   kunpckdql(dst, tmp, tmp);
5474 }
5475 #endif
5476 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte-level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to obtain the reversed bit
// sequence corresponding to a 4-bit value. Thus the reversed bit sequence of a
// byte is obtained by reversing each nibble via the table and swapping the
// positions of the upper and lower nibbles.
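//
// A scalar reference of the per-byte step (illustrative sketch only, not part of
// the generated code; reverse4 holds the bit-reversed value of each 4-bit index,
// presumably the same values served by StubRoutines::x86::vector_reverse_bit_lut()):
//
//   static const uint8_t reverse4[16] =
//       {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
//        0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
//   static inline uint8_t reverse_byte(uint8_t b) {
//     return (uint8_t)((reverse4[b & 0x0F] << 4) | reverse4[b >> 4]);
//   }
//
// For short/int/long the per-byte reversal is followed by a byte swap of the whole
// element (see vector_reverse_byte below).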
5483 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5484                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5485   if (VM_Version::supports_avx512vlbw()) {
5486 
5487     // Get the reverse bit sequence of lower nibble of each byte.
5488     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5489     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5490     evpandq(dst, xtmp2, src, vec_enc);
5491     vpshufb(dst, xtmp1, dst, vec_enc);
5492     vpsllq(dst, dst, 4, vec_enc);
5493 
5494     // Get the reverse bit sequence of upper nibble of each byte.
5495     vpandn(xtmp2, xtmp2, src, vec_enc);
5496     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5497     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5498 
    // OR the left-shifted reversed bit sequence of the lower nibble with the
    // right-shifted reversed bit sequence of the upper nibble to obtain the
    // reversed bit sequence of each byte.
5501     evporq(xtmp2, dst, xtmp2, vec_enc);
5502     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5503 
  } else if (vec_enc == Assembler::AVX_512bit) {
5505     // Shift based bit reversal.
5506     assert(bt == T_LONG || bt == T_INT, "");
5507 
5508     // Swap lower and upper nibble of each byte.
5509     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5510 
5511     // Swap two least and most significant bits of each nibble.
5512     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5513 
5514     // Swap adjacent pair of bits.
5515     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5516     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5517 
5518     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5519     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5520   } else {
5521     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5522     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5523 
5524     // Get the reverse bit sequence of lower nibble of each byte.
5525     vpand(dst, xtmp2, src, vec_enc);
5526     vpshufb(dst, xtmp1, dst, vec_enc);
5527     vpsllq(dst, dst, 4, vec_enc);
5528 
5529     // Get the reverse bit sequence of upper nibble of each byte.
5530     vpandn(xtmp2, xtmp2, src, vec_enc);
5531     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5532     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5533 
    // OR the left-shifted reversed bit sequence of the lower nibble with the
    // right-shifted reversed bit sequence of the upper nibble to obtain the
    // reversed bit sequence of each byte.
5536     vpor(xtmp2, dst, xtmp2, vec_enc);
5537     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5538   }
5539 }
5540 
5541 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5542                                                 XMMRegister xtmp, Register rscratch) {
5543   assert(VM_Version::supports_gfni(), "");
5544   assert(rscratch != noreg || always_reachable(mask), "missing");
5545 
  // Galois field instruction based bit reversal, as per the following algorithm:
5547   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5548   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5549   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5550   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5551 }
5552 
5553 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5554                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5555   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5556   evpandq(dst, xtmp1, src, vec_enc);
5557   vpsllq(dst, dst, nbits, vec_enc);
5558   vpandn(xtmp1, xtmp1, src, vec_enc);
5559   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5560   evporq(dst, dst, xtmp1, vec_enc);
5561 }
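
// Scalar equivalent of the helper above (illustrative sketch only, not part of the
// generated code): with bitmask selecting the lower nbits of every 2*nbits group,
// e.g. 0x0F0F0F0F for nbits == 4,
//
//   static inline uint32_t swap_nbits(uint32_t x, int nbits, uint32_t mask) {
//     return ((x & mask) << nbits) | ((x & ~mask) >> nbits);
//   }
//
// which is exactly the and/shift-left/andn/shift-right/or sequence emitted above.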
5562 
5563 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5564                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5565   // Shift based bit reversal.
5566   assert(VM_Version::supports_evex(), "");
5567   switch(bt) {
5568     case T_LONG:
5569       // Swap upper and lower double word of each quad word.
5570       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5571       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5572       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5573       break;
5574     case T_INT:
5575       // Swap upper and lower word of each double word.
5576       evprord(xtmp1, k0, src, 16, true, vec_enc);
5577       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5578       break;
5579     case T_CHAR:
5580     case T_SHORT:
5581       // Swap upper and lower byte of each word.
5582       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5583       break;
5584     case T_BYTE:
5585       evmovdquq(dst, k0, src, true, vec_enc);
5586       break;
5587     default:
5588       fatal("Unsupported type %s", type2name(bt));
5589       break;
5590   }
5591 }
5592 
5593 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5594   if (bt == T_BYTE) {
5595     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5596       evmovdquq(dst, k0, src, true, vec_enc);
5597     } else {
5598       vmovdqu(dst, src);
5599     }
5600     return;
5601   }
5602   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5603   // pre-computed shuffle indices.
5604   switch(bt) {
5605     case T_LONG:
5606       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5607       break;
5608     case T_INT:
5609       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5610       break;
5611     case T_CHAR:
5612     case T_SHORT:
5613       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5614       break;
5615     default:
5616       fatal("Unsupported type %s", type2name(bt));
5617       break;
5618   }
5619   vpshufb(dst, src, dst, vec_enc);
5620 }
5621 
5622 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5623                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5624                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5625   assert(is_integral_type(bt), "");
5626   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5627   assert(VM_Version::supports_avx512cd(), "");
5628   switch(bt) {
5629     case T_LONG:
5630       evplzcntq(dst, ktmp, src, merge, vec_enc);
5631       break;
5632     case T_INT:
5633       evplzcntd(dst, ktmp, src, merge, vec_enc);
5634       break;
5635     case T_SHORT:
5636       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5637       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5638       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5639       vpunpckhwd(dst, xtmp1, src, vec_enc);
5640       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5641       vpackusdw(dst, xtmp2, dst, vec_enc);
5642       break;
5643     case T_BYTE:
5644       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5645       // accessing the lookup table.
5646       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5647       // accessing the lookup table.
5648       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5649       assert(VM_Version::supports_avx512bw(), "");
5650       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5651       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5652       vpand(xtmp2, dst, src, vec_enc);
5653       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5654       vpsrlw(xtmp3, src, 4, vec_enc);
5655       vpand(xtmp3, dst, xtmp3, vec_enc);
5656       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5657       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5658       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5659       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5660       break;
5661     default:
5662       fatal("Unsupported type %s", type2name(bt));
5663       break;
5664   }
5665 }
5666 
5667 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5668                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5669   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5670   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5671   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5672   // accessing the lookup table.
5673   vpand(dst, xtmp2, src, vec_enc);
5674   vpshufb(dst, xtmp1, dst, vec_enc);
5675   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5676   // accessing the lookup table.
5677   vpsrlw(xtmp3, src, 4, vec_enc);
5678   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5679   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5680   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5681   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5682   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5683   vpaddb(dst, dst, xtmp2, vec_enc);
5684   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5685 }
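
// Scalar reference of the per-byte leading zero count above (illustrative sketch
// only, not part of the generated code; clz4 holds the leading zero count of each
// 4-bit value, presumably the same values served by
// StubRoutines::x86::vector_count_leading_zeros_lut()):
//
//   static const uint8_t clz4[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
//   static inline uint8_t byte_clz(uint8_t b) {
//     uint8_t hi = b >> 4;
//     return (hi == 0) ? (uint8_t)(4 + clz4[b & 0x0F]) : clz4[hi];
//   }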
5686 
5687 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5688                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5689   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5690   // Add zero counts of lower byte and upper byte of a word if
5691   // upper byte holds a zero value.
5692   vpsrlw(xtmp3, src, 8, vec_enc);
5693   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5694   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5695   vpsllw(xtmp2, dst, 8, vec_enc);
5696   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5697   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5698   vpsrlw(dst, dst, 8, vec_enc);
5699 }
5700 
5701 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5702                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  //   LZCNT = 32 - ((biased_exp - 127) + 1)
  // Special handling has been introduced for zero, max_int and negative source values.
5708 
5709   // Broadcast 0xFF
5710   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5711   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5712 
5713   // Extract biased exponent.
5714   vcvtdq2ps(dst, src, vec_enc);
5715   vpsrld(dst, dst, 23, vec_enc);
5716   vpand(dst, dst, xtmp1, vec_enc);
5717 
5718   // Broadcast 127.
5719   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5720   // Exponent = biased_exp - 127
5721   vpsubd(dst, dst, xtmp1, vec_enc);
5722 
5723   // Exponent = Exponent  + 1
5724   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5725   vpaddd(dst, dst, xtmp3, vec_enc);
5726 
  // Replace a negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
5729   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5730   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5731 
5732   // Rematerialize broadcast 32.
5733   vpslld(xtmp1, xtmp3, 5, vec_enc);
5734   // Exponent is 32 if corresponding source lane contains max_int value.
5735   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5736   // LZCNT = 32 - exponent
5737   vpsubd(dst, xtmp1, dst, vec_enc);
5738 
5739   // Replace LZCNT with a value 1 if corresponding source lane
5740   // contains max_int value.
5741   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5742 
  // Replace the computed count with 0 if the source lane value is less than zero.
5744   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5745   vblendvps(dst, dst, xtmp2, src, vec_enc);
5746 }
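
// Scalar sketch of the computation above (illustrative only, not part of the
// generated code; bits and biased_exp are hypothetical names used for exposition):
//
//   float f = (float)x;                                  // vcvtdq2ps
//   uint32_t bits;  memcpy(&bits, &f, sizeof(bits));
//   int biased_exp = (bits >> 23) & 0xFF;                // vpsrld + vpand
//   int lzcnt      = 32 - ((biased_exp - 127) + 1);      // vpsubd, vpaddd, vpsubd
//
// with zero, max_int and negative inputs patched up by the blends above.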
5747 
5748 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5749                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5750   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5751   // Add zero counts of lower word and upper word of a double word if
5752   // upper word holds a zero value.
5753   vpsrld(xtmp3, src, 16, vec_enc);
5754   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5755   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5756   vpslld(xtmp2, dst, 16, vec_enc);
5757   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5758   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5759   vpsrld(dst, dst, 16, vec_enc);
5760   // Add zero counts of lower doubleword and upper doubleword of a
5761   // quadword if upper doubleword holds a zero value.
5762   vpsrlq(xtmp3, src, 32, vec_enc);
5763   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5764   vpsllq(xtmp2, dst, 32, vec_enc);
5765   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5766   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5767   vpsrlq(dst, dst, 32, vec_enc);
5768 }
5769 
5770 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5771                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5772                                                        Register rtmp, int vec_enc) {
5773   assert(is_integral_type(bt), "unexpected type");
5774   assert(vec_enc < Assembler::AVX_512bit, "");
5775   switch(bt) {
5776     case T_LONG:
5777       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5778       break;
5779     case T_INT:
5780       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5781       break;
5782     case T_SHORT:
5783       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5784       break;
5785     case T_BYTE:
5786       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5787       break;
5788     default:
5789       fatal("Unsupported type %s", type2name(bt));
5790       break;
5791   }
5792 }
5793 
5794 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5795   switch(bt) {
5796     case T_BYTE:
5797       vpsubb(dst, src1, src2, vec_enc);
5798       break;
5799     case T_SHORT:
5800       vpsubw(dst, src1, src2, vec_enc);
5801       break;
5802     case T_INT:
5803       vpsubd(dst, src1, src2, vec_enc);
5804       break;
5805     case T_LONG:
5806       vpsubq(dst, src1, src2, vec_enc);
5807       break;
5808     default:
5809       fatal("Unsupported type %s", type2name(bt));
5810       break;
5811   }
5812 }
5813 
// Trailing zero count computation is based on the leading zero count operation as
// per the following equation. All AVX3 targets support the AVX512CD feature, which
// offers a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
5818 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5819                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5820                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5821   assert(is_integral_type(bt), "");
5822   // xtmp = -1
5823   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5824   // xtmp = xtmp + src
5825   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5826   // xtmp = xtmp & ~src
5827   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5828   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5829   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5830   vpsub(bt, dst, xtmp4, dst, vec_enc);
5831 }
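
// Scalar reference of the identity above (illustrative sketch only, not part of the
// generated code; clz32 stands in for a 32-bit leading zero count such as the one
// computed by vplzcntd), shown here for 32-bit lanes:
//
//   static inline int ctz32(uint32_t x) {
//     return 32 - clz32((x - 1) & ~x);   // (x - 1) & ~x keeps exactly the trailing-zero bits
//   }
//
// For x == 0 the masked value is all ones, its leading zero count is 0, and the
// result is the full lane width, as expected.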
5832 
// Trailing zero count computation for AVX2 targets is based on the popcount
// operation as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
5835 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5836                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5837   assert(is_integral_type(bt), "");
5838   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5840   // xtmp = 0 - src
5841   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5842   // xtmp = xtmp | src
5843   vpor(xtmp3, xtmp3, src, vec_enc);
5844   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5845   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5846   vpsub(bt, dst, xtmp1, dst, vec_enc);
5847 }
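
// Scalar reference of the AVX2 identity (illustrative sketch only, not part of the
// generated code; popcount32 stands in for a 32-bit population count), again for
// 32-bit lanes:
//
//   static inline int ctz32_popcnt(uint32_t x) {
//     return 32 - popcount32(x | (0u - x));
//   }
//
// x | -x keeps the lowest set bit and every bit above it, so its popcount is the
// lane width minus the trailing zero count; for x == 0 the OR is 0 and the result
// is 32.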
5848 
5849 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5850   Label done;
5851   Label neg_divisor_fastpath;
5852   cmpl(divisor, 0);
5853   jccb(Assembler::less, neg_divisor_fastpath);
5854   xorl(rdx, rdx);
5855   divl(divisor);
5856   jmpb(done);
5857   bind(neg_divisor_fastpath);
5858   // Fastpath for divisor < 0:
5859   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5860   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5861   movl(rdx, rax);
5862   subl(rdx, divisor);
5863   if (VM_Version::supports_bmi1()) {
5864     andnl(rax, rdx, rax);
5865   } else {
5866     notl(rdx);
5867     andl(rax, rdx);
5868   }
5869   shrl(rax, 31);
5870   bind(done);
5871 }
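
// Scalar reference of the negative-divisor fastpath (illustrative sketch only, not
// part of the generated code): when the divisor has its sign bit set, the unsigned
// quotient can only be 0 or 1, and
//
//   uint32_t q = (dividend & ~(dividend - divisor)) >> 31;   // logical shift
//
// yields 1 exactly when dividend >= divisor as unsigned values.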
5872 
5873 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5874   Label done;
5875   Label neg_divisor_fastpath;
5876   cmpl(divisor, 0);
5877   jccb(Assembler::less, neg_divisor_fastpath);
5878   xorl(rdx, rdx);
5879   divl(divisor);
5880   jmpb(done);
5881   bind(neg_divisor_fastpath);
5882   // Fastpath when divisor < 0:
5883   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5884   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5885   movl(rdx, rax);
5886   subl(rax, divisor);
5887   if (VM_Version::supports_bmi1()) {
5888     andnl(rax, rax, rdx);
5889   } else {
5890     notl(rax);
5891     andl(rax, rdx);
5892   }
5893   sarl(rax, 31);
5894   andl(rax, divisor);
5895   subl(rdx, rax);
5896   bind(done);
5897 }
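
// Scalar reference of the remainder fastpath (illustrative sketch only, not part of
// the generated code): with the same masked value as above,
//
//   int32_t  t = (int32_t)(dividend & ~(dividend - divisor)) >> 31;  // 0 or -1
//   uint32_t r = dividend - ((uint32_t)t & divisor);
//
// i.e. the divisor is subtracted once exactly when the unsigned quotient is 1.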
5898 
5899 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5900   Label done;
5901   Label neg_divisor_fastpath;
5902 
5903   cmpl(divisor, 0);
5904   jccb(Assembler::less, neg_divisor_fastpath);
5905   xorl(rdx, rdx);
5906   divl(divisor);
5907   jmpb(done);
5908   bind(neg_divisor_fastpath);
5909   // Fastpath for divisor < 0:
5910   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5911   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5912   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5913   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5914   movl(rdx, rax);
5915   subl(rax, divisor);
5916   if (VM_Version::supports_bmi1()) {
5917     andnl(rax, rax, rdx);
5918   } else {
5919     notl(rax);
5920     andl(rax, rdx);
5921   }
5922   movl(tmp, rax);
5923   shrl(rax, 31); // quotient
5924   sarl(tmp, 31);
5925   andl(tmp, divisor);
5926   subl(rdx, tmp); // remainder
5927   bind(done);
5928 }
5929 
5930 #ifdef _LP64
5931 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5932                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5936     mov64(rtmp, 0x8040201008040201L);
5937     movq(xtmp1, src);
5938     movq(xtmp2, rtmp);
5939     gf2p8affineqb(xtmp1, xtmp2, 0);
5940     movq(dst, xtmp1);
5941   } else {
5942     // Swap even and odd numbered bits.
5943     movl(rtmp, src);
5944     andl(rtmp, 0x55555555);
5945     shll(rtmp, 1);
5946     movl(dst, src);
5947     andl(dst, 0xAAAAAAAA);
5948     shrl(dst, 1);
5949     orl(dst, rtmp);
5950 
5951     // Swap LSB and MSB 2 bits of each nibble.
5952     movl(rtmp, dst);
5953     andl(rtmp, 0x33333333);
5954     shll(rtmp, 2);
5955     andl(dst, 0xCCCCCCCC);
5956     shrl(dst, 2);
5957     orl(dst, rtmp);
5958 
5959     // Swap LSB and MSB 4 bits of each byte.
5960     movl(rtmp, dst);
5961     andl(rtmp, 0x0F0F0F0F);
5962     shll(rtmp, 4);
5963     andl(dst, 0xF0F0F0F0);
5964     shrl(dst, 4);
5965     orl(dst, rtmp);
5966   }
5967   bswapl(dst);
5968 }
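
// Scalar equivalent of the non-GFNI path above (illustrative sketch only, not part
// of the generated code), for a uint32_t x:
//
//   x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);  // swap even/odd bits
//   x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);  // swap 2-bit pairs
//   x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);  // swap nibbles
//   x = byte_swap_32(x);          // bswapl above; byte_swap_32 is a placeholder name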
5969 
5970 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5971                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5975     mov64(rtmp1, 0x8040201008040201L);
5976     movq(xtmp1, src);
5977     movq(xtmp2, rtmp1);
5978     gf2p8affineqb(xtmp1, xtmp2, 0);
5979     movq(dst, xtmp1);
5980   } else {
5981     // Swap even and odd numbered bits.
5982     movq(rtmp1, src);
5983     mov64(rtmp2, 0x5555555555555555L);
5984     andq(rtmp1, rtmp2);
5985     shlq(rtmp1, 1);
5986     movq(dst, src);
5987     notq(rtmp2);
5988     andq(dst, rtmp2);
5989     shrq(dst, 1);
5990     orq(dst, rtmp1);
5991 
5992     // Swap LSB and MSB 2 bits of each nibble.
5993     movq(rtmp1, dst);
5994     mov64(rtmp2, 0x3333333333333333L);
5995     andq(rtmp1, rtmp2);
5996     shlq(rtmp1, 2);
5997     notq(rtmp2);
5998     andq(dst, rtmp2);
5999     shrq(dst, 2);
6000     orq(dst, rtmp1);
6001 
6002     // Swap LSB and MSB 4 bits of each byte.
6003     movq(rtmp1, dst);
6004     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6005     andq(rtmp1, rtmp2);
6006     shlq(rtmp1, 4);
6007     notq(rtmp2);
6008     andq(dst, rtmp2);
6009     shrq(dst, 4);
6010     orq(dst, rtmp1);
6011   }
6012   bswapq(dst);
6013 }
6014 
6015 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6016   Label done;
6017   Label neg_divisor_fastpath;
6018   cmpq(divisor, 0);
6019   jccb(Assembler::less, neg_divisor_fastpath);
6020   xorl(rdx, rdx);
6021   divq(divisor);
6022   jmpb(done);
6023   bind(neg_divisor_fastpath);
6024   // Fastpath for divisor < 0:
6025   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6026   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6027   movq(rdx, rax);
6028   subq(rdx, divisor);
6029   if (VM_Version::supports_bmi1()) {
6030     andnq(rax, rdx, rax);
6031   } else {
6032     notq(rdx);
6033     andq(rax, rdx);
6034   }
6035   shrq(rax, 63);
6036   bind(done);
6037 }
6038 
6039 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6040   Label done;
6041   Label neg_divisor_fastpath;
6042   cmpq(divisor, 0);
6043   jccb(Assembler::less, neg_divisor_fastpath);
6044   xorq(rdx, rdx);
6045   divq(divisor);
6046   jmp(done);
6047   bind(neg_divisor_fastpath);
6048   // Fastpath when divisor < 0:
6049   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6050   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6051   movq(rdx, rax);
6052   subq(rax, divisor);
6053   if (VM_Version::supports_bmi1()) {
6054     andnq(rax, rax, rdx);
6055   } else {
6056     notq(rax);
6057     andq(rax, rdx);
6058   }
6059   sarq(rax, 63);
6060   andq(rax, divisor);
6061   subq(rdx, rax);
6062   bind(done);
6063 }
6064 
6065 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6066   Label done;
6067   Label neg_divisor_fastpath;
6068   cmpq(divisor, 0);
6069   jccb(Assembler::less, neg_divisor_fastpath);
6070   xorq(rdx, rdx);
6071   divq(divisor);
6072   jmp(done);
6073   bind(neg_divisor_fastpath);
6074   // Fastpath for divisor < 0:
6075   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6076   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6077   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6078   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6079   movq(rdx, rax);
6080   subq(rax, divisor);
6081   if (VM_Version::supports_bmi1()) {
6082     andnq(rax, rax, rdx);
6083   } else {
6084     notq(rax);
6085     andq(rax, rdx);
6086   }
6087   movq(tmp, rax);
6088   shrq(rax, 63); // quotient
6089   sarq(tmp, 63);
6090   andq(tmp, divisor);
6091   subq(rdx, tmp); // remainder
6092   bind(done);
6093 }
6094 #endif
6095 
6096 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6097                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6098                                         int vlen_enc) {
6099   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the indices are determined using the
  // lower 4 bits of each shuffle lane, so all shuffle indices are effectively
  // normalized to the index range 0-15. This makes sure that indices differing by a
  // multiple of 16 select the same relative position within a 128-bit lane, i.e.
  // shuffle indices 16, 32 and 48 all select the first byte of their respective
  // 128-bit source lanes.
6106   movl(rtmp, 16);
6107   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6108 
6109   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6110   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6111   // original shuffle indices and move the shuffled lanes corresponding to true
6112   // mask to destination vector.
6113   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6114   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6115   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6116 
6117   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6118   // and broadcasting second 128 bit lane.
6119   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6120   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6121   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6122   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6123   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6124 
6125   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6126   // and broadcasting third 128 bit lane.
6127   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6128   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6129   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6130   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6131   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6132 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6135   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6136   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6137   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6138   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6139   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6140 }
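
// Scalar reference of the overall effect (illustrative sketch only, not part of the
// generated code), assuming all 64 shuffle indices are in the range 0..63:
//
//   for (int i = 0; i < 64; i++) {
//     dst[i] = src[shuffle[i]];
//   }
//
// The masked in-lane VPSHUFBs above achieve this by broadcasting one 128-bit source
// lane at a time and committing only those destination bytes whose shuffle index
// selects that lane.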
6141