1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do a
  // stack bang then we must use the 6-byte frame allocation even if
  // we have no frame. :-(
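  // (Encoding reference, for orientation: "push rbp" is 1 byte; the 3- and 6-byte
  //  frame allocations above are the 32-bit "sub esp, imm8" and "sub esp, imm32"
  //  forms, while the 64-bit "sub rsp" forms are one byte longer due to REX.W.)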
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  69   // We require that their callers must bang for them.  But be careful, because
  70   // some VM calls (such as call site linkage) can use several kilobytes of
  71   // stack.  But the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
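  // (Roughly speaking, generate_stack_overflow_check() below touches memory up to
  //  stack_bang_size bytes beyond the current rsp, so an impending overflow traps
  //  on the guard pages here rather than at some arbitrary store inside the new frame.)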
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp, so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 #if INCLUDE_RTM_OPT
 176 
 177 // Update rtm_counters based on abort status
 178 // input: abort_status
 179 //        rtm_counters (RTMLockingCounters*)
 180 // flags are killed
 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 182 
 183   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 184   if (PrintPreciseRTMLockingStatistics) {
 185     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 186       Label check_abort;
 187       testl(abort_status, (1<<i));
 188       jccb(Assembler::equal, check_abort);
 189       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 190       bind(check_abort);
 191     }
 192   }
 193 }
 194 
// Branch if ((random & (count-1)) != 0); count must be a power of two (2^n)
 196 // tmp, scr and flags are killed
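//
// For example (illustrative, assuming the default RTMTotalCountIncrRate == 64):
// the low six bits of the TSC serve as a cheap pseudo-random value, so callers
// fall through to their increment roughly once every 64 calls and branch to
// brLabel the rest of the time.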
 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 198   assert(tmp == rax, "");
 199   assert(scr == rdx, "");
 200   rdtsc(); // modifies EDX:EAX
 201   andptr(tmp, count-1);
 202   jccb(Assembler::notZero, brLabel);
 203 }
 204 
 205 // Perform abort ratio calculation, set no_rtm bit if high ratio
 206 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 207 // tmpReg, rtm_counters_Reg and flags are killed
 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 209                                                     Register rtm_counters_Reg,
 210                                                     RTMLockingCounters* rtm_counters,
 211                                                     Metadata* method_data) {
 212   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 213 
 214   if (RTMLockingCalculationDelay > 0) {
 215     // Delay calculation
 216     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 217     testptr(tmpReg, tmpReg);
 218     jccb(Assembler::equal, L_done);
 219   }
 220   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 221   //   Aborted transactions = abort_count * 100
 222   //   All transactions = total_count *  RTMTotalCountIncrRate
 223   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
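  //
  //   Worked example (illustrative, assuming RTMTotalCountIncrRate == 64 and
  //   RTMAbortRatio == 50): with total_count == 1000 the estimated transaction
  //   count is 64000, so the no_rtm bit is set once abort_count * 100 >= 64000 * 50,
  //   i.e. once abort_count reaches 32000.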
 224 
 225   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 226   cmpptr(tmpReg, RTMAbortThreshold);
 227   jccb(Assembler::below, L_check_always_rtm2);
 228   imulptr(tmpReg, tmpReg, 100);
 229 
 230   Register scrReg = rtm_counters_Reg;
 231   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 232   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 233   imulptr(scrReg, scrReg, RTMAbortRatio);
 234   cmpptr(tmpReg, scrReg);
 235   jccb(Assembler::below, L_check_always_rtm1);
 236   if (method_data != nullptr) {
 237     // set rtm_state to "no rtm" in MDO
 238     mov_metadata(tmpReg, method_data);
 239     lock();
 240     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 241   }
 242   jmpb(L_done);
 243   bind(L_check_always_rtm1);
 244   // Reload RTMLockingCounters* address
 245   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 246   bind(L_check_always_rtm2);
 247   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 248   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 249   jccb(Assembler::below, L_done);
 250   if (method_data != nullptr) {
 251     // set rtm_state to "always rtm" in MDO
 252     mov_metadata(tmpReg, method_data);
 253     lock();
 254     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 255   }
 256   bind(L_done);
 257 }
 258 
 259 // Update counters and perform abort ratio calculation
 260 // input:  abort_status_Reg
 261 // rtm_counters_Reg, flags are killed
 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 263                                       Register rtm_counters_Reg,
 264                                       RTMLockingCounters* rtm_counters,
 265                                       Metadata* method_data,
 266                                       bool profile_rtm) {
 267 
 268   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 269   // update rtm counters based on rax value at abort
 270   // reads abort_status_Reg, updates flags
 271   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 272   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 273   if (profile_rtm) {
 274     // Save abort status because abort_status_Reg is used by following code.
 275     if (RTMRetryCount > 0) {
 276       push(abort_status_Reg);
 277     }
 278     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 279     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 280     // restore abort status
 281     if (RTMRetryCount > 0) {
 282       pop(abort_status_Reg);
 283     }
 284   }
 285 }
 286 
 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 288 // inputs: retry_count_Reg
 289 //       : abort_status_Reg
 290 // output: retry_count_Reg decremented by 1
 291 // flags are killed
 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 293   Label doneRetry;
 294   assert(abort_status_Reg == rax, "");
 295   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 296   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 297   // if reason is in 0x6 and retry count != 0 then retry
 298   andptr(abort_status_Reg, 0x6);
 299   jccb(Assembler::zero, doneRetry);
 300   testl(retry_count_Reg, retry_count_Reg);
 301   jccb(Assembler::zero, doneRetry);
 302   pause();
 303   decrementl(retry_count_Reg);
 304   jmp(retryLabel);
 305   bind(doneRetry);
 306 }
 307 
 308 // Spin and retry if lock is busy,
 309 // inputs: box_Reg (monitor address)
 310 //       : retry_count_Reg
 311 // output: retry_count_Reg decremented by 1
 312 //       : clear z flag if retry count exceeded
 313 // tmp_Reg, scr_Reg, flags are killed
 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 315                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 316   Label SpinLoop, SpinExit, doneRetry;
 317   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 318 
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   decrementl(retry_count_Reg);
 322   movptr(scr_Reg, RTMSpinLoopCount);
 323 
 324   bind(SpinLoop);
 325   pause();
 326   decrementl(scr_Reg);
 327   jccb(Assembler::lessEqual, SpinExit);
 328   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 329   testptr(tmp_Reg, tmp_Reg);
 330   jccb(Assembler::notZero, SpinLoop);
 331 
 332   bind(SpinExit);
 333   jmp(retryLabel);
 334   bind(doneRetry);
 335   incrementl(retry_count_Reg); // clear z flag
 336 }
 337 
 338 // Use RTM for normal stack locks
 339 // Input: objReg (object to lock)
 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 341                                          Register retry_on_abort_count_Reg,
 342                                          RTMLockingCounters* stack_rtm_counters,
 343                                          Metadata* method_data, bool profile_rtm,
 344                                          Label& DONE_LABEL, Label& IsInflated) {
 345   assert(UseRTMForStackLocks, "why call this otherwise?");
 346   assert(tmpReg == rax, "");
 347   assert(scrReg == rdx, "");
 348   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 349 
 350   if (RTMRetryCount > 0) {
 351     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 352     bind(L_rtm_retry);
 353   }
 354   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 355   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 356   jcc(Assembler::notZero, IsInflated);
 357 
 358   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 359     Label L_noincrement;
 360     if (RTMTotalCountIncrRate > 1) {
 361       // tmpReg, scrReg and flags are killed
 362       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 363     }
 364     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 365     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 366     bind(L_noincrement);
 367   }
 368   xbegin(L_on_abort);
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 370   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 371   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 372   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 373 
 374   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 375   if (UseRTMXendForLockBusy) {
 376     xend();
 377     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 378     jmp(L_decrement_retry);
 379   }
 380   else {
 381     xabort(0);
 382   }
 383   bind(L_on_abort);
 384   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 385     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 386   }
 387   bind(L_decrement_retry);
 388   if (RTMRetryCount > 0) {
 389     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 390     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 391   }
 392 }
 393 
 394 // Use RTM for inflating locks
 395 // inputs: objReg (object to lock)
 396 //         boxReg (on-stack box address (displaced header location) - KILLED)
 397 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 399                                             Register scrReg, Register retry_on_busy_count_Reg,
 400                                             Register retry_on_abort_count_Reg,
 401                                             RTMLockingCounters* rtm_counters,
 402                                             Metadata* method_data, bool profile_rtm,
 403                                             Label& DONE_LABEL) {
 404   assert(UseRTMLocking, "why call this otherwise?");
 405   assert(tmpReg == rax, "");
 406   assert(scrReg == rdx, "");
 407   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 408   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 409 
 410   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 411   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 412 
 413   if (RTMRetryCount > 0) {
 414     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 415     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 416     bind(L_rtm_retry);
 417   }
 418   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 419     Label L_noincrement;
 420     if (RTMTotalCountIncrRate > 1) {
 421       // tmpReg, scrReg and flags are killed
 422       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 423     }
 424     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 425     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 426     bind(L_noincrement);
 427   }
 428   xbegin(L_on_abort);
 429   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 430   movptr(tmpReg, Address(tmpReg, owner_offset));
 431   testptr(tmpReg, tmpReg);
 432   jcc(Assembler::zero, DONE_LABEL);
 433   if (UseRTMXendForLockBusy) {
 434     xend();
 435     jmp(L_decrement_retry);
 436   }
 437   else {
 438     xabort(0);
 439   }
 440   bind(L_on_abort);
 441   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 442   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 443     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 444   }
 445   if (RTMRetryCount > 0) {
 446     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 447     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 448   }
 449 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 453 
 454   // Appears unlocked - try to swing _owner from null to non-null.
 455   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 456 #ifdef _LP64
 457   Register threadReg = r15_thread;
 458 #else
 459   get_thread(scrReg);
 460   Register threadReg = scrReg;
 461 #endif
 462   movptr(scrReg, Address(threadReg, JavaThread::lock_id_offset()));
 463   lock();
 464   cmpxchgptr(scrReg, Address(boxReg, owner_offset)); // Updates tmpReg
 465 
 466   if (RTMRetryCount > 0) {
 467     // success done else retry
 468     jccb(Assembler::equal, DONE_LABEL) ;
 469     bind(L_decrement_retry);
 470     // Spin and retry if lock is busy.
 471     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 472     jmp(DONE_LABEL);
 473   }
 474   else {
 475     bind(L_decrement_retry);
 476     jmp(DONE_LABEL);
 477   }
 478 }
 479 
 480 #endif //  INCLUDE_RTM_OPT
 481 
 482 // fast_lock and fast_unlock used by C2
 483 
 484 // Because the transitions from emitted code to the runtime
 485 // monitorenter/exit helper stubs are so slow it's critical that
 486 // we inline both the stack-locking fast path and the inflated fast path.
 487 //
 488 // See also: cmpFastLock and cmpFastUnlock.
 489 //
 490 // What follows is a specialized inline transliteration of the code
 491 // in enter() and exit(). If we're concerned about I$ bloat another
 492 // option would be to emit TrySlowEnter and TrySlowExit methods
 493 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 495 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 496 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 497 // In practice, however, the # of lock sites is bounded and is usually small.
 498 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 502 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 504 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 505 // to those specialized methods.  That'd give us a mostly platform-independent
 506 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 508 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 509 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 510 // (b) explicit barriers or fence operations.
 511 //
 512 // TODO:
 513 //
 514 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 515 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 516 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 517 //    the lock operators would typically be faster than reifying Self.
 518 //
 519 // *  Ideally I'd define the primitives as:
 520 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 521 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 522 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 524 //    Furthermore the register assignments are overconstrained, possibly resulting in
 525 //    sub-optimal code near the synchronization site.
 526 //
 527 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 528 //    Alternately, use a better sp-proximity test.
 529 //
 530 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 531 //    Either one is sufficient to uniquely identify a thread.
 532 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 533 //
 534 // *  Intrinsify notify() and notifyAll() for the common cases where the
 535 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 537 //
 538 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 539 //    But beware of excessive branch density on AMD Opterons.
 540 //
 541 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 542 //    or failure of the fast path.  If the fast path fails then we pass
 543 //    control to the slow path, typically in C.  In fast_lock and
 544 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 545 //    will emit a conditional branch immediately after the node.
 546 //    So we have branches to branches and lots of ICC.ZF games.
 547 //    Instead, it might be better to have C2 pass a "FailureLabel"
 548 //    into fast_lock and fast_unlock.  In the case of success, control
 549 //    will drop through the node.  ICC.ZF is undefined at exit.
 550 //    In the case of failure, the node will branch directly to the
 551 //    FailureLabel
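//
// A conceptual sketch (not the emitted code) of how C2 consumes this protocol:
//
//   fast_lock(obj, box, ...)      // sets ZF: 1 = lock acquired, 0 = need slow path
//   jne   slow_path_call          // cmpFastLock is followed by one branch keyed on ZF
//   ...                           // fall through on success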
 552 
 553 
 554 // obj: object to lock
 555 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 557 // scr: tmp -- KILLED
 558 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 559                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 560                                  RTMLockingCounters* rtm_counters,
 561                                  RTMLockingCounters* stack_rtm_counters,
 562                                  Metadata* method_data,
 563                                  bool use_rtm, bool profile_rtm) {
 564   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 565   // Ensure the register assignments are disjoint
 566   assert(tmpReg == rax, "");
 567 
 568   if (use_rtm) {
 569     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 570   } else {
 571     assert(cx1Reg == noreg, "");
 572     assert(cx2Reg == noreg, "");
 573     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 574   }
 575 
 576   // Possible cases that we'll encounter in fast_lock
 577   // ------------------------------------------------
 578   // * Inflated
 579   //    -- unlocked
 580   //    -- Locked
 581   //       = by self
 582   //       = by other
 583   // * neutral
 584   // * stack-locked
 585   //    -- by self
 586   //       = sp-proximity test hits
 587   //       = sp-proximity test generates false-negative
 588   //    -- by other
 589   //
 590 
 591   Label IsInflated, DONE_LABEL, COUNT;
 592 
 593   if (DiagnoseSyncOnValueBasedClasses != 0) {
 594     load_klass(tmpReg, objReg, scrReg);
 595     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 596     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 597     jcc(Assembler::notZero, DONE_LABEL);
 598   }
 599 
 600 #if INCLUDE_RTM_OPT
 601   if (UseRTMForStackLocks && use_rtm) {
 602     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 603     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 604                       stack_rtm_counters, method_data, profile_rtm,
 605                       DONE_LABEL, IsInflated);
 606   }
 607 #endif // INCLUDE_RTM_OPT
 608 
 609   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 610   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 611   jcc(Assembler::notZero, IsInflated);
 612 
 613   if (LockingMode == LM_MONITOR) {
 614     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 615     testptr(objReg, objReg);
 616   } else {
 617     assert(LockingMode == LM_LEGACY, "must be");
 618     // Attempt stack-locking ...
 619     orptr (tmpReg, markWord::unlocked_value);
 620     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 621     lock();
 622     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 623     jcc(Assembler::equal, COUNT);           // Success
 624 
 625     // Recursive locking.
 626     // The object is stack-locked: markword contains stack pointer to BasicLock.
 627     // Locked by current thread if difference with current SP is less than one page.
 628     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 630     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
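    // (Illustrative, assuming a 4 KiB page on 64-bit: 7 - 4096 == 0xfffff007 as an
    //  int32, so the AND leaves zero - and hence ZF == 1 - exactly when
    //  0 <= mark - rsp < 4096 and the low three bits of the difference are clear.)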
 631     movptr(Address(boxReg, 0), tmpReg);
 632   }
 633   // After recursive stack locking attempt case
 634   jmp(DONE_LABEL);
 635 
 636   bind(IsInflated);
 637   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 638 
 639 #if INCLUDE_RTM_OPT
 640   // Use the same RTM locking code in 32- and 64-bit VM.
 641   if (use_rtm) {
 642     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 643                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 644   } else {
 645 #endif // INCLUDE_RTM_OPT
 646 
 647 #ifndef _LP64
 648   // The object is inflated.
 649 
 650   // boxReg refers to the on-stack BasicLock in the current frame.
 651   // We'd like to write:
 652   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 654   // additional latency as we have another ST in the store buffer that must drain.
 655 
 656   // avoid ST-before-CAS
 657   // register juggle because we need tmpReg for cmpxchgptr below
 658   movptr(scrReg, boxReg);
 659   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 660 
 661   // Optimistic form: consider XORL tmpReg,tmpReg
 662   movptr(tmpReg, NULL_WORD);
 663 
 664   // Appears unlocked - try to swing _owner from null to non-null.
 665   // Ideally, I'd manifest "Self" with get_thread and then attempt
 666   // to CAS the register containing thread id into m->Owner.
 667   // But we don't have enough registers, so instead we can either try to CAS
 668   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 669   // we later store thread id into m->Owner.  Transiently storing a stack address
 670   // (rsp or the address of the box) into  m->owner is harmless.
 671   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 672   lock();
 673   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 674   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 675   // If we weren't able to swing _owner from null to the BasicLock
 676   // then take the slow path.
 677   jccb  (Assembler::notZero, DONE_LABEL);
 678   // update _owner from BasicLock to thread
 679   get_thread (scrReg);                    // beware: clobbers ICCs
 680   movptr(scrReg, Address(scrReg, JavaThread::lock_id_offset()));
 681   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 682   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 683   jmp(DONE_LABEL);
 684 
 685   // If the CAS fails we can either retry or pass control to the slow path.
 686   // We use the latter tactic.
 687   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 688   // If the CAS was successful ...
 689   //   Self has acquired the lock
 690   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 691   // Intentional fall-through into DONE_LABEL ...
 692 #else // _LP64
 693   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 694   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 695   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 696 
 697   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 698   movq(scrReg, tmpReg);
 699   xorq(tmpReg, tmpReg);
 700   movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
 701   lock();
 702   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 703 
 704   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 705   jccb(Assembler::equal, DONE_LABEL);    // CAS above succeeded; propagate ZF = 1 (success)
 706 
 707   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 708   jccb(Assembler::notEqual, DONE_LABEL);    // If not recursive, ZF = 0 at this point (fail)
 709   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 710   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 711   jmp(DONE_LABEL);
 712 #endif // _LP64
 713 #if INCLUDE_RTM_OPT
 714   } // use_rtm()
 715 #endif
 716 
 717   bind(COUNT);
 718   // Count monitors in fast path
 719   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 720   xorl(tmpReg, tmpReg); // Set ZF == 1
 721 
 722   bind(DONE_LABEL);
 723 
 724   // At DONE_LABEL the icc ZFlag is set as follows ...
 725   // fast_unlock uses the same protocol.
 726   // ZFlag == 1 -> Success
 727   // ZFlag == 0 -> Failure - force control through the slow path
 728 }
 729 
 730 // obj: object to unlock
 731 // box: box address (displaced header location), killed.  Must be EAX.
 732 // tmp: killed, cannot be obj nor box.
 733 //
 734 // Some commentary on balanced locking:
 735 //
 736 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 737 // Methods that don't have provably balanced locking are forced to run in the
 738 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 739 // The interpreter provides two properties:
 740 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 742 //      interpreter maintains an on-stack list of locks currently held by
 743 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 746 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 748 // B() doesn't have provably balanced locking so it runs in the interpreter.
 749 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 750 // is still locked by A().
 751 //
 752 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 753 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 754 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 755 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
 758 // In the interest of performance we elide m->Owner==Self check in unlock.
 759 // A perfectly viable alternative is to elide the owner check except when
 760 // Xcheck:jni is enabled.
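//
// For illustration: an ordinary Java-level synchronized (o) { ... } block compiles
// to structurally balanced monitorenter/monitorexit and is the case fast_lock and
// fast_unlock are emitted for; bytecode whose locking cannot be proven balanced is
// instead forced into the interpreter, as noted above.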
 761 
 762 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, bool use_rtm) {
 763   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 764   assert(boxReg == rax, "");
 765   assert_different_registers(objReg, boxReg, tmpReg);
 766 
 767   Label DONE_LABEL, Stacked, COUNT;
 768 
 769 #if INCLUDE_RTM_OPT
 770   if (UseRTMForStackLocks && use_rtm) {
 771     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 772     Label L_regular_unlock;
 773     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 774     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 775     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 776     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 777     xend();                                                           // otherwise end...
 778     jmp(DONE_LABEL);                                                  // ... and we're done
 779     bind(L_regular_unlock);
 780   }
 781 #endif
 782 
 783   if (LockingMode == LM_LEGACY) {
 784     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 785     jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 786   }
 787   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 788   if (LockingMode != LM_MONITOR) {
 789     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 790     jcc(Assembler::zero, Stacked);
 791   }
 792 
 793   // It's inflated.
  // If the owner is ANONYMOUS, we need to fix it - in an out-of-line stub.
 795   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 796 #ifdef _LP64
 797   if (!Compile::current()->output()->in_scratch_emit_size()) {
 798     C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 799     Compile::current()->output()->add_stub(stub);
 800     jcc(Assembler::equal, stub->entry());
 801     bind(stub->continuation());
 802   } else
 803 #endif
 804   {
 805     // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 806     // Call the slow-path instead.
 807     jcc(Assembler::notEqual, DONE_LABEL);
 808   }
 809 
 810 #if INCLUDE_RTM_OPT
 811   if (use_rtm) {
 812     Label L_regular_inflated_unlock;
 813     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 814     movptr(boxReg, Address(tmpReg, owner_offset));
 815     testptr(boxReg, boxReg);
 816     jccb(Assembler::notZero, L_regular_inflated_unlock);
 817     xend();
 818     jmp(DONE_LABEL);
 819     bind(L_regular_inflated_unlock);
 820   }
 821 #endif
 822 
 823   // Despite our balanced locking property we still check that m->_owner == Self
 824   // as java routines or native JNI code called by this thread might
 825   // have released the lock.
 826   // Refer to the comments in synchronizer.cpp for how we might encode extra
 827   // state in _succ so we can avoid fetching EntryList|cxq.
 828   //
 829   // If there's no contention try a 1-0 exit.  That is, exit without
 830   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 831   // we detect and recover from the race that the 1-0 exit admits.
 832   //
 833   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 834   // before it STs null into _owner, releasing the lock.  Updates
 835   // to data protected by the critical section must be visible before
 836   // we drop the lock (and thus before any other thread could acquire
 837   // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
 839   // each other and there's no need for an explicit barrier (fence).
 840   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
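  //
  // Conceptual sketch of the 64-bit exit path below (not literal code):
  //   if (m->_recursions != 0) { m->_recursions--; done }            // recursive unlock
  //   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = null; done } // uncontended 1-0 exit
  //   if (m->_succ == null) goto slow_path;                          // contended, no heir apparent
  //   m->_owner = null; fence;                                       // release, then ST _owner / LD _succ ordering
  //   if (m->_succ != null) done;                                    // a successor will take over
  //   if (CAS(&m->_owner, null, Self) fails) done;                   // someone else now owns it
  //   goto slow_path;                                                // we re-own it; must ensure succession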
 841 #ifndef _LP64
 842   // Note that we could employ various encoding schemes to reduce
 843   // the number of loads below (currently 4) to just 2 or 3.
 844   // Refer to the comments in synchronizer.cpp.
 845   // In practice the chain of fetches doesn't seem to impact performance, however.
 846   xorptr(boxReg, boxReg);
 847   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 848   jccb  (Assembler::notZero, DONE_LABEL);
 849   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 850   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 851   jccb  (Assembler::notZero, DONE_LABEL);
 852   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 853   jmpb  (DONE_LABEL);
 854 #else // _LP64
 855   // It's inflated
 856   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 857 
 858   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 859   jccb(Assembler::equal, LNotRecursive);
 860 
 861   // Recursive inflated unlock
 862   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 863   xorl(tmpReg, tmpReg); // Set ZF == 1
 864   jmp(DONE_LABEL);
 865 
 866   bind(LNotRecursive);
 867 
 868   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 869   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 870   jccb  (Assembler::notZero, CheckSucc);
 871   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 872   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 873   jmpb  (DONE_LABEL);
 874 
 875   // Try to avoid passing control into the slow_path ...
 876   bind  (CheckSucc);
 877 
 878   // The following optional optimization can be elided if necessary
 879   // Effectively: if (succ == null) goto slow path
 880   // The code reduces the window for a race, however,
 881   // and thus benefits performance.
 882   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 883   jccb  (Assembler::zero, LGoSlowPath);
 884 
 885   xorptr(boxReg, boxReg);
 886   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 887   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 888 
 889   // Memory barrier/fence
 890   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 891   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 892   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 893   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 894   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 895   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 896   lock(); addl(Address(rsp, 0), 0);
 897 
 898   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 899   jccb  (Assembler::notZero, LSuccess);
 900 
 901   // Rare inopportune interleaving - race.
 902   // The successor vanished in the small window above.
 903   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 904   // We need to ensure progress and succession.
 905   // Try to reacquire the lock.
 906   // If that fails then the new owner is responsible for succession and this
 907   // thread needs to take no further action and can exit via the fast path (success).
 908   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 912 
 913   // box is really RAX -- the following CMPXCHG depends on that binding
 914   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 915   movptr(scrReg, Address(r15_thread, JavaThread::lock_id_offset()));
 916   lock();
 917   cmpxchgptr(scrReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 918   // There's no successor so we tried to regrab the lock.
 919   // If that didn't work, then another thread grabbed the
 920   // lock so we're done (and exit was a success).
 921   jccb  (Assembler::notEqual, LSuccess);
 922   // Intentional fall-through into slow path
 923 
 924   bind  (LGoSlowPath);
 925   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 926   jmpb  (DONE_LABEL);
 927 
 928   bind  (LSuccess);
 929   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 930   jmpb  (DONE_LABEL);
 931 
 932 #endif
 933   if (LockingMode == LM_LEGACY) {
 934     bind  (Stacked);
 935     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 936     lock();
 937     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 938     jccb(Assembler::notZero, DONE_LABEL);
 939     // Count monitors in fast path
 940 #ifndef _LP64
 941     get_thread(tmpReg);
 942     decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 943 #else // _LP64
 944     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 945 #endif
 946     xorl(tmpReg, tmpReg); // Set ZF == 1
 947   }
 948 
 949   // ZFlag == 1 -> Success
 950   // ZFlag == 0 -> Failure - force control through the slow path
 951   bind(DONE_LABEL);
 952 }
 953 
 954 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 955                                               Register t, Register thread) {
 956   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 957   assert(rax_reg == rax, "Used for CAS");
 958   assert_different_registers(obj, box, rax_reg, t, thread);
 959 
 960   // Handle inflated monitor.
 961   Label inflated;
 962   // Finish fast lock successfully.
 963   Label locked;
 964   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 965   Label slow_path;
 966 
 967   if (DiagnoseSyncOnValueBasedClasses != 0) {
 968     load_klass(rax_reg, obj, t);
 969     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 970     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 971     jcc(Assembler::notZero, slow_path);
 972   }
 973 
 974   const Register mark = t;
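  // Reminder of the mark-word lock bits used below (see markWord.hpp):
  //   0b01 = unlocked (neutral), 0b00 = fast-locked, 0b10 = inflated monitor.
  // With lightweight locking, recursion is expressed by the same oop appearing
  // in consecutive lock-stack slots rather than by a displaced header.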
 975 
 976   { // Lightweight Lock
 977 
 978     Label push;
 979 
 980     const Register top = box;
 981 
 982     // Load the mark.
 983     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 984 
 985     // Prefetch top.
 986     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 987 
 988     // Check for monitor (0b10).
 989     testptr(mark, markWord::monitor_value);
 990     jcc(Assembler::notZero, inflated);
 991 
 992     // Check if lock-stack is full.
 993     cmpl(top, LockStack::end_offset() - 1);
 994     jcc(Assembler::greater, slow_path);
 995 
 996     // Check if recursive.
 997     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 998     jccb(Assembler::equal, push);
 999 
1000     // Try to lock. Transition lock bits 0b01 => 0b00
1001     movptr(rax_reg, mark);
1002     orptr(rax_reg, markWord::unlocked_value);
1003     andptr(mark, ~(int32_t)markWord::unlocked_value);
1004     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1005     jcc(Assembler::notEqual, slow_path);
1006 
1007     bind(push);
1008     // After successful lock, push object on lock-stack.
1009     movptr(Address(thread, top), obj);
1010     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1011     xorl(rax_reg, rax_reg);
1012     jmpb(locked);
1013   }
1014 
1015   { // Handle inflated monitor.
1016     bind(inflated);
1017 
1018     const Register tagged_monitor = mark;
1019 
1020     // CAS owner (null => current thread).
1021     xorptr(rax_reg, rax_reg);
1022     movptr(box, Address(thread, JavaThread::lock_id_offset()));
1023     lock(); cmpxchgptr(box, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1024     jccb(Assembler::equal, locked);
1025 
1026     // Check if recursive.
1027     cmpptr(box, rax_reg);
1028     jccb(Assembler::notEqual, slow_path);
1029 
1030     // Recursive.
1031     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1032     xorl(rax_reg, rax_reg);
1033   }
1034 
1035   bind(locked);
1036 #ifdef ASSERT
1037   // Check that locked label is reached with ZF set.
1038   Label zf_correct;
1039   Label zf_bad_zero;
1040   jcc(Assembler::zero, zf_correct);
1041   jmp(zf_bad_zero);
1042 #endif
1043 
1044   bind(slow_path);
1045 #ifdef ASSERT
1046   // Check that slow_path label is reached with ZF not set.
1047   jcc(Assembler::notZero, zf_correct);
1048   stop("Fast Lock ZF != 0");
1049   bind(zf_bad_zero);
1050   stop("Fast Lock ZF != 1");
1051   bind(zf_correct);
1052 #endif
1053   // C2 uses the value of ZF to determine the continuation.
1054 }
1055 
1056 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t1, Register t2, Register thread) {
1057   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1058   assert(reg_rax == rax, "Used for CAS");
1059   assert_different_registers(obj, reg_rax, t1, t2);
1060 
1061   // Handle inflated monitor.
1062   Label inflated, inflated_check_lock_stack;
1063   // Finish fast unlock successfully.  MUST jump with ZF == 1
1064   Label unlocked;
1065 
1066   const Register mark = t1;
1067   const Register top = reg_rax;
1068 
1069   Label dummy;
1070   C2FastUnlockLightweightStub* stub = nullptr;
1071 
1072   if (!Compile::current()->output()->in_scratch_emit_size()) {
1073     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, t2, thread);
1074     Compile::current()->output()->add_stub(stub);
1075   }
1076 
1077   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1078   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1079 
1080   { // Lightweight Unlock
1081 
1082     // Load top.
1083     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1084 
1085     // Prefetch mark.
1086     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1087 
1088     // Check if obj is top of lock-stack.
1089     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1090     // Top of lock stack was not obj. Must be monitor.
1091     jcc(Assembler::notEqual, inflated_check_lock_stack);
1092 
1093     // Pop lock-stack.
1094     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1095     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1096 
1097     // Check if recursive.
1098     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1099     jcc(Assembler::equal, unlocked);
1100 
    // We elide the monitor check and let the CAS fail instead.
1102 
1103     // Try to unlock. Transition lock bits 0b00 => 0b01
1104     movptr(reg_rax, mark);
1105     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1106     orptr(mark, markWord::unlocked_value);
1107     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1108     jcc(Assembler::notEqual, push_and_slow_path);
1109     jmp(unlocked);
1110   }
1111 
1112 
1113   { // Handle inflated monitor.
1114     bind(inflated_check_lock_stack);
1115 #ifdef ASSERT
1116     Label check_done;
1117     subl(top, oopSize);
1118     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1119     jcc(Assembler::below, check_done);
1120     cmpptr(obj, Address(thread, top));
1121     jccb(Assembler::notEqual, inflated_check_lock_stack);
1122     stop("Fast Unlock lock on stack");
1123     bind(check_done);
1124     testptr(mark, markWord::monitor_value);
1125     jccb(Assembler::notZero, inflated);
1126     stop("Fast Unlock not monitor");
1127 #endif
1128 
1129     bind(inflated);
1130 
1131     // mark contains the tagged ObjectMonitor*.
1132     const Register monitor = mark;
1133 
1134 #ifndef _LP64
1135     // Check if recursive.
1136     xorptr(reg_rax, reg_rax);
1137     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1138     jcc(Assembler::notZero, check_successor);
1139 
1140     // Check if the entry lists are empty.
1141     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1142     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1143     jcc(Assembler::notZero, check_successor);
1144 
1145     // Release lock.
1146     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1147 #else // _LP64
1148     Label recursive;
1149 
1150     // Check if recursive.
1151     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1152     jccb(Assembler::notEqual, recursive);
1153 
1154     // Check if the entry lists are empty.
1155     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1156     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1157     jcc(Assembler::notZero, check_successor);
1158 
1159     // Release lock.
1160     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1161     jmpb(unlocked);
1162 
1163     // Recursive unlock.
1164     bind(recursive);
1165     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1166     xorl(t1, t1);
1167 #endif
1168   }
1169 
1170   bind(unlocked);
1171   if (stub != nullptr) {
1172     bind(stub->unlocked_continuation());
1173   }
1174 
1175 #ifdef ASSERT
1176   // Check that unlocked label is reached with ZF set.
1177   Label zf_correct;
1178   jcc(Assembler::zero, zf_correct);
1179   stop("Fast Unlock ZF != 1");
1180 #endif
1181 
1182   if (stub != nullptr) {
1183     bind(stub->slow_path_continuation());
1184   }
1185 #ifdef ASSERT
1186   // Check that stub->continuation() label is reached with ZF not set.
1187   jccb(Assembler::notZero, zf_correct);
1188   stop("Fast Unlock ZF != 0");
1189   bind(zf_correct);
1190 #endif
1191   // C2 uses the value of ZF to determine the continuation.
1192 }
1193 
1194 //-------------------------------------------------------------------------------------------
1195 // Generic instructions support for use in .ad files C2 code generation
1196 
1197 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1198   if (dst != src) {
1199     movdqu(dst, src);
1200   }
1201   if (opcode == Op_AbsVD) {
1202     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1203   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1205     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1206   }
1207 }
1208 
1209 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1210   if (opcode == Op_AbsVD) {
1211     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1212   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1214     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1215   }
1216 }
1217 
1218 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1219   if (dst != src) {
1220     movdqu(dst, src);
1221   }
1222   if (opcode == Op_AbsVF) {
1223     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1224   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1226     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1227   }
1228 }
1229 
1230 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1231   if (opcode == Op_AbsVF) {
1232     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1233   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1235     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1236   }
1237 }
1238 
1239 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1240   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1241   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1242 
1243   if (opcode == Op_MinV) {
1244     if (elem_bt == T_BYTE) {
1245       pminsb(dst, src);
1246     } else if (elem_bt == T_SHORT) {
1247       pminsw(dst, src);
1248     } else if (elem_bt == T_INT) {
1249       pminsd(dst, src);
1250     } else {
1251       assert(elem_bt == T_LONG, "required");
1252       assert(tmp == xmm0, "required");
1253       assert_different_registers(dst, src, tmp);
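      // There is no packed 64-bit signed min below AVX-512, so synthesize it:
      // xmm0 = (dst > src) per lane, then blendvpd picks src wherever the mask
      // is set, leaving min(dst, src).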
1254       movdqu(xmm0, dst);
1255       pcmpgtq(xmm0, src);
1256       blendvpd(dst, src);  // xmm0 as mask
1257     }
1258   } else { // opcode == Op_MaxV
1259     if (elem_bt == T_BYTE) {
1260       pmaxsb(dst, src);
1261     } else if (elem_bt == T_SHORT) {
1262       pmaxsw(dst, src);
1263     } else if (elem_bt == T_INT) {
1264       pmaxsd(dst, src);
1265     } else {
1266       assert(elem_bt == T_LONG, "required");
1267       assert(tmp == xmm0, "required");
1268       assert_different_registers(dst, src, tmp);
1269       movdqu(xmm0, src);
1270       pcmpgtq(xmm0, dst);
1271       blendvpd(dst, src);  // xmm0 as mask
1272     }
1273   }
1274 }
1275 
1276 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1277                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1278                                  int vlen_enc) {
1279   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1280 
1281   if (opcode == Op_MinV) {
1282     if (elem_bt == T_BYTE) {
1283       vpminsb(dst, src1, src2, vlen_enc);
1284     } else if (elem_bt == T_SHORT) {
1285       vpminsw(dst, src1, src2, vlen_enc);
1286     } else if (elem_bt == T_INT) {
1287       vpminsd(dst, src1, src2, vlen_enc);
1288     } else {
1289       assert(elem_bt == T_LONG, "required");
1290       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1291         vpminsq(dst, src1, src2, vlen_enc);
1292       } else {
1293         assert_different_registers(dst, src1, src2);
1294         vpcmpgtq(dst, src1, src2, vlen_enc);
1295         vblendvpd(dst, src1, src2, dst, vlen_enc);
1296       }
1297     }
1298   } else { // opcode == Op_MaxV
1299     if (elem_bt == T_BYTE) {
1300       vpmaxsb(dst, src1, src2, vlen_enc);
1301     } else if (elem_bt == T_SHORT) {
1302       vpmaxsw(dst, src1, src2, vlen_enc);
1303     } else if (elem_bt == T_INT) {
1304       vpmaxsd(dst, src1, src2, vlen_enc);
1305     } else {
1306       assert(elem_bt == T_LONG, "required");
1307       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1308         vpmaxsq(dst, src1, src2, vlen_enc);
1309       } else {
1310         assert_different_registers(dst, src1, src2);
1311         vpcmpgtq(dst, src1, src2, vlen_enc);
1312         vblendvpd(dst, src2, src1, dst, vlen_enc);
1313       }
1314     }
1315   }
1316 }
1317 
1318 // Float/Double min max
1319 
1320 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1321                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1322                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1323                                    int vlen_enc) {
1324   assert(UseAVX > 0, "required");
1325   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1326          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1327   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1328   assert_different_registers(a, tmp, atmp, btmp);
1329   assert_different_registers(b, tmp, atmp, btmp);
1330 
1331   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1332   bool is_double_word = is_double_word_type(elem_bt);
1333 
  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, no-op otherwise)
   *  b. NaN: Check whether it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */
1352 
1353   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1354   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1355   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1356   XMMRegister mask;
1357 
1358   if (!is_double_word && is_min) {
1359     mask = a;
1360     vblend = &MacroAssembler::vblendvps;
1361     vmaxmin = &MacroAssembler::vminps;
1362     vcmp = &MacroAssembler::vcmpps;
1363   } else if (!is_double_word && !is_min) {
1364     mask = b;
1365     vblend = &MacroAssembler::vblendvps;
1366     vmaxmin = &MacroAssembler::vmaxps;
1367     vcmp = &MacroAssembler::vcmpps;
1368   } else if (is_double_word && is_min) {
1369     mask = a;
1370     vblend = &MacroAssembler::vblendvpd;
1371     vmaxmin = &MacroAssembler::vminpd;
1372     vcmp = &MacroAssembler::vcmppd;
1373   } else {
1374     assert(is_double_word && !is_min, "sanity");
1375     mask = b;
1376     vblend = &MacroAssembler::vblendvpd;
1377     vmaxmin = &MacroAssembler::vmaxpd;
1378     vcmp = &MacroAssembler::vcmppd;
1379   }
1380 
  // Make sure the EnableX86ECoreOpts optimization is not defeated by register overlaps (dst may alias btmp)
1382   XMMRegister maxmin, scratch;
1383   if (dst == btmp) {
1384     maxmin = btmp;
1385     scratch = tmp;
1386   } else {
1387     maxmin = tmp;
1388     scratch = btmp;
1389   }
1390 
  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1392   if (precompute_mask && !is_double_word) {
1393     vpsrad(tmp, mask, 32, vlen_enc);
1394     mask = tmp;
1395   } else if (precompute_mask && is_double_word) {
1396     vpxor(tmp, tmp, tmp, vlen_enc);
1397     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1398     mask = tmp;
1399   }
1400 
1401   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1402   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1403   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1404   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1405   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1406 }
1407 
1408 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1409                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1410                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1411                                     int vlen_enc) {
1412   assert(UseAVX > 2, "required");
1413   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1414          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1415   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1416   assert_different_registers(dst, a, atmp, btmp);
1417   assert_different_registers(dst, b, atmp, btmp);
1418 
1419   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1420   bool is_double_word = is_double_word_type(elem_bt);
1421   bool merge = true;
1422 
1423   if (!is_double_word && is_min) {
1424     evpmovd2m(ktmp, a, vlen_enc);
1425     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1426     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1427     vminps(dst, atmp, btmp, vlen_enc);
1428     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1429     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1430   } else if (!is_double_word && !is_min) {
1431     evpmovd2m(ktmp, b, vlen_enc);
1432     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1433     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1434     vmaxps(dst, atmp, btmp, vlen_enc);
1435     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1436     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1437   } else if (is_double_word && is_min) {
1438     evpmovq2m(ktmp, a, vlen_enc);
1439     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1440     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1441     vminpd(dst, atmp, btmp, vlen_enc);
1442     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1443     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1444   } else {
1445     assert(is_double_word && !is_min, "sanity");
1446     evpmovq2m(ktmp, b, vlen_enc);
1447     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1448     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1449     vmaxpd(dst, atmp, btmp, vlen_enc);
1450     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1451     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1452   }
1453 }
1454 
1455 // Float/Double signum
1456 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1457   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
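  // Pseudo code for what follows (dst initially holds the argument):
  //   if (dst == +/-0.0 || dst is NaN) return dst;   // ucomis* + je/jp
  //   dst = 1.0;                                     // movflt/movdbl does not touch flags
  //   if (argument was < 0.0) dst = -dst;            // sign flip via xorps/xorpd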
1458 
1459   Label DONE_LABEL;
1460 
1461   if (opcode == Op_SignumF) {
1462     assert(UseSSE > 0, "required");
1463     ucomiss(dst, zero);
1464     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1465     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1466     movflt(dst, one);
1467     jcc(Assembler::above, DONE_LABEL);
1468     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1469   } else if (opcode == Op_SignumD) {
1470     assert(UseSSE > 1, "required");
1471     ucomisd(dst, zero);
1472     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1473     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1474     movdbl(dst, one);
1475     jcc(Assembler::above, DONE_LABEL);
1476     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1477   }
1478 
1479   bind(DONE_LABEL);
1480 }
1481 
1482 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1483   if (sign) {
1484     pmovsxbw(dst, src);
1485   } else {
1486     pmovzxbw(dst, src);
1487   }
1488 }
1489 
1490 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1491   if (sign) {
1492     vpmovsxbw(dst, src, vector_len);
1493   } else {
1494     vpmovzxbw(dst, src, vector_len);
1495   }
1496 }
1497 
1498 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1499   if (sign) {
1500     vpmovsxbd(dst, src, vector_len);
1501   } else {
1502     vpmovzxbd(dst, src, vector_len);
1503   }
1504 }
1505 
1506 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1507   if (sign) {
1508     vpmovsxwd(dst, src, vector_len);
1509   } else {
1510     vpmovzxwd(dst, src, vector_len);
1511   }
1512 }
1513 
1514 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1515                                      int shift, int vector_len) {
1516   if (opcode == Op_RotateLeftV) {
1517     if (etype == T_INT) {
1518       evprold(dst, src, shift, vector_len);
1519     } else {
1520       assert(etype == T_LONG, "expected type T_LONG");
1521       evprolq(dst, src, shift, vector_len);
1522     }
1523   } else {
1524     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1525     if (etype == T_INT) {
1526       evprord(dst, src, shift, vector_len);
1527     } else {
1528       assert(etype == T_LONG, "expected type T_LONG");
1529       evprorq(dst, src, shift, vector_len);
1530     }
1531   }
1532 }
1533 
1534 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1535                                      XMMRegister shift, int vector_len) {
1536   if (opcode == Op_RotateLeftV) {
1537     if (etype == T_INT) {
1538       evprolvd(dst, src, shift, vector_len);
1539     } else {
1540       assert(etype == T_LONG, "expected type T_LONG");
1541       evprolvq(dst, src, shift, vector_len);
1542     }
1543   } else {
1544     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1545     if (etype == T_INT) {
1546       evprorvd(dst, src, shift, vector_len);
1547     } else {
1548       assert(etype == T_LONG, "expected type T_LONG");
1549       evprorvq(dst, src, shift, vector_len);
1550     }
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1555   if (opcode == Op_RShiftVI) {
1556     psrad(dst, shift);
1557   } else if (opcode == Op_LShiftVI) {
1558     pslld(dst, shift);
1559   } else {
1560     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1561     psrld(dst, shift);
1562   }
1563 }
1564 
1565 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1566   switch (opcode) {
1567     case Op_RShiftVI:  psrad(dst, shift); break;
1568     case Op_LShiftVI:  pslld(dst, shift); break;
1569     case Op_URShiftVI: psrld(dst, shift); break;
1570 
1571     default: assert(false, "%s", NodeClassNames[opcode]);
1572   }
1573 }
1574 
1575 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1576   if (opcode == Op_RShiftVI) {
1577     vpsrad(dst, nds, shift, vector_len);
1578   } else if (opcode == Op_LShiftVI) {
1579     vpslld(dst, nds, shift, vector_len);
1580   } else {
1581     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1582     vpsrld(dst, nds, shift, vector_len);
1583   }
1584 }
1585 
1586 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1587   switch (opcode) {
1588     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1589     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1590     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1591 
1592     default: assert(false, "%s", NodeClassNames[opcode]);
1593   }
1594 }
1595 
1596 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1597   switch (opcode) {
1598     case Op_RShiftVB:  // fall-through
1599     case Op_RShiftVS:  psraw(dst, shift); break;
1600 
1601     case Op_LShiftVB:  // fall-through
1602     case Op_LShiftVS:  psllw(dst, shift);   break;
1603 
1604     case Op_URShiftVS: // fall-through
1605     case Op_URShiftVB: psrlw(dst, shift);  break;
1606 
1607     default: assert(false, "%s", NodeClassNames[opcode]);
1608   }
1609 }
1610 
1611 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1612   switch (opcode) {
1613     case Op_RShiftVB:  // fall-through
1614     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1615 
1616     case Op_LShiftVB:  // fall-through
1617     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1618 
1619     case Op_URShiftVS: // fall-through
1620     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1621 
1622     default: assert(false, "%s", NodeClassNames[opcode]);
1623   }
1624 }
1625 
1626 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1627   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1629     case Op_LShiftVL:  psllq(dst, shift); break;
1630     case Op_URShiftVL: psrlq(dst, shift); break;
1631 
1632     default: assert(false, "%s", NodeClassNames[opcode]);
1633   }
1634 }
1635 
1636 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1637   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1639   } else if (opcode == Op_LShiftVL) {
1640     psllq(dst, shift);
1641   } else {
1642     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1643     psrlq(dst, shift);
1644   }
1645 }
1646 
1647 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1648   switch (opcode) {
1649     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1650     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1651     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1652 
1653     default: assert(false, "%s", NodeClassNames[opcode]);
1654   }
1655 }
1656 
1657 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1658   if (opcode == Op_RShiftVL) {
1659     evpsraq(dst, nds, shift, vector_len);
1660   } else if (opcode == Op_LShiftVL) {
1661     vpsllq(dst, nds, shift, vector_len);
1662   } else {
1663     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1664     vpsrlq(dst, nds, shift, vector_len);
1665   }
1666 }
1667 
1668 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1669   switch (opcode) {
1670     case Op_RShiftVB:  // fall-through
1671     case Op_RShiftVS:  // fall-through
1672     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1673 
1674     case Op_LShiftVB:  // fall-through
1675     case Op_LShiftVS:  // fall-through
1676     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1677 
1678     case Op_URShiftVB: // fall-through
1679     case Op_URShiftVS: // fall-through
1680     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1681 
1682     default: assert(false, "%s", NodeClassNames[opcode]);
1683   }
1684 }
1685 
1686 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1687   switch (opcode) {
1688     case Op_RShiftVB:  // fall-through
1689     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1690 
1691     case Op_LShiftVB:  // fall-through
1692     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1693 
1694     case Op_URShiftVB: // fall-through
1695     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1696 
1697     default: assert(false, "%s", NodeClassNames[opcode]);
1698   }
1699 }
1700 
1701 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1702   assert(UseAVX >= 2, "required");
1703   switch (opcode) {
1704     case Op_RShiftVL: {
1705       if (UseAVX > 2) {
1706         assert(tmp == xnoreg, "not used");
1707         if (!VM_Version::supports_avx512vl()) {
1708           vlen_enc = Assembler::AVX_512bit;
1709         }
1710         evpsravq(dst, src, shift, vlen_enc);
1711       } else {
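        // No vpsravq before AVX-512: emulate the 64-bit arithmetic shift with logical
        // shifts. With m = the per-lane sign-bit constant (vector_long_sign_mask) and
        // t = m >>> s, the sequence below computes
        //   sra(x, s) == ((x >>> s) ^ t) - t
        // The xor folds the original sign bit into the result and the subtract either
        // cancels it (non-negative) or fills the vacated high bits with ones (negative).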
1712         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1713         vpsrlvq(dst, src, shift, vlen_enc);
1714         vpsrlvq(tmp, tmp, shift, vlen_enc);
1715         vpxor(dst, dst, tmp, vlen_enc);
1716         vpsubq(dst, dst, tmp, vlen_enc);
1717       }
1718       break;
1719     }
1720     case Op_LShiftVL: {
1721       assert(tmp == xnoreg, "not used");
1722       vpsllvq(dst, src, shift, vlen_enc);
1723       break;
1724     }
1725     case Op_URShiftVL: {
1726       assert(tmp == xnoreg, "not used");
1727       vpsrlvq(dst, src, shift, vlen_enc);
1728       break;
1729     }
1730     default: assert(false, "%s", NodeClassNames[opcode]);
1731   }
1732 }
1733 
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
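// Sketch of the approach used below (illustrative only): bytes are widened to dwords so
// an existing per-element dword shift can do the work, the results are clipped back to
// byte range, and the dwords are then packed down to words:
//   dwords = sign_or_zero_extend(src bytes)          // vextendbd
//   dwords = dwords shifted by zero_extend(shift)    // varshiftd
//   dwords &= 0xFF                                   // vector_int_to_byte_mask
//   dst    = pack(dwords)                            // vpackusdw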
1735 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1736   assert(opcode == Op_LShiftVB ||
1737          opcode == Op_RShiftVB ||
1738          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1739   bool sign = (opcode != Op_URShiftVB);
1740   assert(vector_len == 0, "required");
1741   vextendbd(sign, dst, src, 1);
1742   vpmovzxbd(vtmp, shift, 1);
1743   varshiftd(opcode, dst, dst, vtmp, 1);
1744   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1745   vextracti128_high(vtmp, dst);
1746   vpackusdw(dst, dst, vtmp, 0);
1747 }
1748 
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1750 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1751   assert(opcode == Op_LShiftVB ||
1752          opcode == Op_RShiftVB ||
1753          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1754   bool sign = (opcode != Op_URShiftVB);
1755   int ext_vector_len = vector_len + 1;
1756   vextendbw(sign, dst, src, ext_vector_len);
1757   vpmovzxbw(vtmp, shift, ext_vector_len);
1758   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1759   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1760   if (vector_len == 0) {
1761     vextracti128_high(vtmp, dst);
1762     vpackuswb(dst, dst, vtmp, vector_len);
1763   } else {
1764     vextracti64x4_high(vtmp, dst);
1765     vpackuswb(dst, dst, vtmp, vector_len);
1766     vpermq(dst, dst, 0xD8, vector_len);
1767   }
1768 }
1769 
1770 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1771   switch(typ) {
1772     case T_BYTE:
1773       pinsrb(dst, val, idx);
1774       break;
1775     case T_SHORT:
1776       pinsrw(dst, val, idx);
1777       break;
1778     case T_INT:
1779       pinsrd(dst, val, idx);
1780       break;
1781     case T_LONG:
1782       pinsrq(dst, val, idx);
1783       break;
1784     default:
1785       assert(false,"Should not reach here.");
1786       break;
1787   }
1788 }
1789 
1790 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1791   switch(typ) {
1792     case T_BYTE:
1793       vpinsrb(dst, src, val, idx);
1794       break;
1795     case T_SHORT:
1796       vpinsrw(dst, src, val, idx);
1797       break;
1798     case T_INT:
1799       vpinsrd(dst, src, val, idx);
1800       break;
1801     case T_LONG:
1802       vpinsrq(dst, src, val, idx);
1803       break;
1804     default:
1805       assert(false,"Should not reach here.");
1806       break;
1807   }
1808 }
1809 
1810 #ifdef _LP64
1811 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
1812                                                 XMMRegister dst, Register base,
1813                                                 Register idx_base,
1814                                                 Register offset, Register mask,
1815                                                 Register mask_idx, Register rtmp,
1816                                                 int vlen_enc) {
1817   vpxor(dst, dst, dst, vlen_enc);
1818   if (elem_bt == T_SHORT) {
1819     for (int i = 0; i < 4; i++) {
1820       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1821       Label skip_load;
1822       btq(mask, mask_idx);
1823       jccb(Assembler::carryClear, skip_load);
1824       movl(rtmp, Address(idx_base, i * 4));
1825       if (offset != noreg) {
1826         addl(rtmp, offset);
1827       }
1828       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1829       bind(skip_load);
1830       incq(mask_idx);
1831     }
1832   } else {
1833     assert(elem_bt == T_BYTE, "");
1834     for (int i = 0; i < 8; i++) {
1835       // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
1836       Label skip_load;
1837       btq(mask, mask_idx);
1838       jccb(Assembler::carryClear, skip_load);
1839       movl(rtmp, Address(idx_base, i * 4));
1840       if (offset != noreg) {
1841         addl(rtmp, offset);
1842       }
1843       pinsrb(dst, Address(base, rtmp), i);
1844       bind(skip_load);
1845       incq(mask_idx);
1846     }
1847   }
1848 }
1849 #endif // _LP64
1850 
1851 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
1852                                          Register base, Register idx_base,
1853                                          Register offset, Register rtmp,
1854                                          int vlen_enc) {
1855   vpxor(dst, dst, dst, vlen_enc);
1856   if (elem_bt == T_SHORT) {
1857     for (int i = 0; i < 4; i++) {
1858       // dst[i] = src[offset + idx_base[i]]
1859       movl(rtmp, Address(idx_base, i * 4));
1860       if (offset != noreg) {
1861         addl(rtmp, offset);
1862       }
1863       pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1864     }
1865   } else {
1866     assert(elem_bt == T_BYTE, "");
1867     for (int i = 0; i < 8; i++) {
1868       // dst[i] = src[offset + idx_base[i]]
1869       movl(rtmp, Address(idx_base, i * 4));
1870       if (offset != noreg) {
1871         addl(rtmp, offset);
1872       }
1873       pinsrb(dst, Address(base, rtmp), i);
1874     }
1875   }
1876 }
1877 
1878 /*
1879  * Gather using hybrid algorithm, first partially unroll scalar loop
1880  * to accumulate values from gather indices into a quad-word(64bit) slice.
1881  * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1882  * permutation to place the slice into appropriate vector lane
1883  * locations in destination vector. Following pseudo code describes the
1884  * algorithm in detail:
1885  *
1886  * DST_VEC = ZERO_VEC
1887  * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1888  * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1889  * FOREACH_ITER:
1890  *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1891  *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1892  *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
1893  *     PERM_INDEX = PERM_INDEX - TWO_VEC
1894  *
1895  * With each iteration, doubleword permute indices (0,1) corresponding
1896  * to gathered quadword gets right shifted by two lane positions.
1897  *
1898  */
1899 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1900                                         Register base, Register idx_base,
1901                                         Register offset, Register mask,
1902                                         XMMRegister xtmp1, XMMRegister xtmp2,
1903                                         XMMRegister temp_dst, Register rtmp,
1904                                         Register mask_idx, Register length,
1905                                         int vector_len, int vlen_enc) {
1906   Label GATHER8_LOOP;
1907   assert(is_subword_type(elem_ty), "");
1908   movl(length, vector_len);
1909   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1910   vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1911   vallones(xtmp2, vlen_enc);
1912   vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1913   vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1914   load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1915 
1916   bind(GATHER8_LOOP);
1917     // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1918     if (mask == noreg) {
1919       vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
1920     } else {
1921       LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
1922     }
1923     // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1924     vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1925     // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1926     vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1927     // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1928     vpor(dst, dst, temp_dst, vlen_enc);
1929     addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
1930     subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1931     jcc(Assembler::notEqual, GATHER8_LOOP);
1932 }
1933 
1934 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1935   switch(typ) {
1936     case T_INT:
1937       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1938       break;
1939     case T_FLOAT:
1940       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1941       break;
1942     case T_LONG:
1943       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1944       break;
1945     case T_DOUBLE:
1946       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1947       break;
1948     default:
1949       assert(false,"Should not reach here.");
1950       break;
1951   }
1952 }
1953 
1954 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1955   switch(typ) {
1956     case T_INT:
1957       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1958       break;
1959     case T_FLOAT:
1960       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1961       break;
1962     case T_LONG:
1963       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1964       break;
1965     case T_DOUBLE:
1966       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1967       break;
1968     default:
1969       assert(false,"Should not reach here.");
1970       break;
1971   }
1972 }
1973 
1974 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1975   switch(typ) {
1976     case T_INT:
1977       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1978       break;
1979     case T_FLOAT:
1980       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1981       break;
1982     case T_LONG:
1983       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1984       break;
1985     case T_DOUBLE:
1986       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1987       break;
1988     default:
1989       assert(false,"Should not reach here.");
1990       break;
1991   }
1992 }
1993 
1994 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
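  // Expand a boolean vector (one byte per element, 0 or 1) into a full-width mask:
  // 0 - x turns each 1 into 0xFF, and the subsequent sign-extension widens that byte
  // to the element size, yielding all-ones/all-zeros lanes suitable for blends.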
1995   if (vlen_in_bytes <= 16) {
1996     pxor (dst, dst);
1997     psubb(dst, src);
1998     switch (elem_bt) {
1999       case T_BYTE:   /* nothing to do */ break;
2000       case T_SHORT:  pmovsxbw(dst, dst); break;
2001       case T_INT:    pmovsxbd(dst, dst); break;
2002       case T_FLOAT:  pmovsxbd(dst, dst); break;
2003       case T_LONG:   pmovsxbq(dst, dst); break;
2004       case T_DOUBLE: pmovsxbq(dst, dst); break;
2005 
2006       default: assert(false, "%s", type2name(elem_bt));
2007     }
2008   } else {
2009     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
2010     int vlen_enc = vector_length_encoding(vlen_in_bytes);
2011 
2012     vpxor (dst, dst, dst, vlen_enc);
2013     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
2014 
2015     switch (elem_bt) {
2016       case T_BYTE:   /* nothing to do */            break;
2017       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
2018       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
2019       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
2020       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
2021       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
2022 
2023       default: assert(false, "%s", type2name(elem_bt));
2024     }
2025   }
2026 }
2027 
2028 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
2029   if (novlbwdq) {
2030     vpmovsxbd(xtmp, src, vlen_enc);
2031     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
2032             Assembler::eq, true, vlen_enc, noreg);
2033   } else {
2034     vpxor(xtmp, xtmp, xtmp, vlen_enc);
2035     vpsubb(xtmp, xtmp, src, vlen_enc);
2036     evpmovb2m(dst, xtmp, vlen_enc);
2037   }
2038 }
2039 
2040 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
2041   switch (vlen_in_bytes) {
2042     case 4:  movdl(dst, src);   break;
2043     case 8:  movq(dst, src);    break;
2044     case 16: movdqu(dst, src);  break;
2045     case 32: vmovdqu(dst, src); break;
2046     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
2047     default: ShouldNotReachHere();
2048   }
2049 }
2050 
2051 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
2052   assert(rscratch != noreg || always_reachable(src), "missing");
2053 
2054   if (reachable(src)) {
2055     load_vector(dst, as_Address(src), vlen_in_bytes);
2056   } else {
2057     lea(rscratch, src);
2058     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
2059   }
2060 }
2061 
2062 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
2063   int vlen_enc = vector_length_encoding(vlen);
2064   if (VM_Version::supports_avx()) {
2065     if (bt == T_LONG) {
2066       if (VM_Version::supports_avx2()) {
2067         vpbroadcastq(dst, src, vlen_enc);
2068       } else {
2069         vmovddup(dst, src, vlen_enc);
2070       }
2071     } else if (bt == T_DOUBLE) {
2072       if (vlen_enc != Assembler::AVX_128bit) {
2073         vbroadcastsd(dst, src, vlen_enc, noreg);
2074       } else {
2075         vmovddup(dst, src, vlen_enc);
2076       }
2077     } else {
2078       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
2079         vpbroadcastd(dst, src, vlen_enc);
2080       } else {
2081         vbroadcastss(dst, src, vlen_enc);
2082       }
2083     }
2084   } else if (VM_Version::supports_sse3()) {
2085     movddup(dst, src);
2086   } else {
2087     movq(dst, src);
2088     if (vlen == 16) {
2089       punpcklqdq(dst, dst);
2090     }
2091   }
2092 }
2093 
2094 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
2095   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
2096   int offset = exact_log2(type2aelembytes(bt)) << 6;
2097   if (is_floating_point_type(bt)) {
2098     offset += 128;
2099   }
2100   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
2101   load_vector(dst, addr, vlen_in_bytes);
2102 }
2103 
2104 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
2105 
2106 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
2107   int vector_len = Assembler::AVX_128bit;
2108 
2109   switch (opcode) {
2110     case Op_AndReductionV:  pand(dst, src); break;
2111     case Op_OrReductionV:   por (dst, src); break;
2112     case Op_XorReductionV:  pxor(dst, src); break;
2113     case Op_MinReductionV:
2114       switch (typ) {
2115         case T_BYTE:        pminsb(dst, src); break;
2116         case T_SHORT:       pminsw(dst, src); break;
2117         case T_INT:         pminsd(dst, src); break;
2118         case T_LONG:        assert(UseAVX > 2, "required");
2119                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
2120         default:            assert(false, "wrong type");
2121       }
2122       break;
2123     case Op_MaxReductionV:
2124       switch (typ) {
2125         case T_BYTE:        pmaxsb(dst, src); break;
2126         case T_SHORT:       pmaxsw(dst, src); break;
2127         case T_INT:         pmaxsd(dst, src); break;
2128         case T_LONG:        assert(UseAVX > 2, "required");
2129                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
2130         default:            assert(false, "wrong type");
2131       }
2132       break;
2133     case Op_AddReductionVF: addss(dst, src); break;
2134     case Op_AddReductionVD: addsd(dst, src); break;
2135     case Op_AddReductionVI:
2136       switch (typ) {
2137         case T_BYTE:        paddb(dst, src); break;
2138         case T_SHORT:       paddw(dst, src); break;
2139         case T_INT:         paddd(dst, src); break;
2140         default:            assert(false, "wrong type");
2141       }
2142       break;
2143     case Op_AddReductionVL: paddq(dst, src); break;
2144     case Op_MulReductionVF: mulss(dst, src); break;
2145     case Op_MulReductionVD: mulsd(dst, src); break;
2146     case Op_MulReductionVI:
2147       switch (typ) {
2148         case T_SHORT:       pmullw(dst, src); break;
2149         case T_INT:         pmulld(dst, src); break;
2150         default:            assert(false, "wrong type");
2151       }
2152       break;
2153     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2154                             evpmullq(dst, dst, src, vector_len); break;
2155     default:                assert(false, "wrong opcode");
2156   }
2157 }
2158 
2159 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2160   int vector_len = Assembler::AVX_256bit;
2161 
2162   switch (opcode) {
2163     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2164     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2165     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2166     case Op_MinReductionV:
2167       switch (typ) {
2168         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2169         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2170         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2171         case T_LONG:        assert(UseAVX > 2, "required");
2172                             vpminsq(dst, src1, src2, vector_len); break;
2173         default:            assert(false, "wrong type");
2174       }
2175       break;
2176     case Op_MaxReductionV:
2177       switch (typ) {
2178         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2179         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2180         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2181         case T_LONG:        assert(UseAVX > 2, "required");
2182                             vpmaxsq(dst, src1, src2, vector_len); break;
2183         default:            assert(false, "wrong type");
2184       }
2185       break;
2186     case Op_AddReductionVI:
2187       switch (typ) {
2188         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2189         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2190         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2191         default:            assert(false, "wrong type");
2192       }
2193       break;
2194     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2195     case Op_MulReductionVI:
2196       switch (typ) {
2197         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2198         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2199         default:            assert(false, "wrong type");
2200       }
2201       break;
2202     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2203     default:                assert(false, "wrong opcode");
2204   }
2205 }
2206 
2207 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2208                                   XMMRegister dst, XMMRegister src,
2209                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2210   switch (opcode) {
2211     case Op_AddReductionVF:
2212     case Op_MulReductionVF:
2213       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2214       break;
2215 
2216     case Op_AddReductionVD:
2217     case Op_MulReductionVD:
2218       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2219       break;
2220 
2221     default: assert(false, "wrong opcode");
2222   }
2223 }
2224 
2225 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2226                              Register dst, Register src1, XMMRegister src2,
2227                              XMMRegister vtmp1, XMMRegister vtmp2) {
2228   switch (vlen) {
2229     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2230     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2231     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2232     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2233 
2234     default: assert(false, "wrong vector length");
2235   }
2236 }
2237 
2238 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2239                              Register dst, Register src1, XMMRegister src2,
2240                              XMMRegister vtmp1, XMMRegister vtmp2) {
2241   switch (vlen) {
2242     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2243     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2244     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2245     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2246 
2247     default: assert(false, "wrong vector length");
2248   }
2249 }
2250 
2251 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2252                              Register dst, Register src1, XMMRegister src2,
2253                              XMMRegister vtmp1, XMMRegister vtmp2) {
2254   switch (vlen) {
2255     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2256     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2257     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2258     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2259 
2260     default: assert(false, "wrong vector length");
2261   }
2262 }
2263 
2264 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2265                              Register dst, Register src1, XMMRegister src2,
2266                              XMMRegister vtmp1, XMMRegister vtmp2) {
2267   switch (vlen) {
2268     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2269     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2270     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2271     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2272 
2273     default: assert(false, "wrong vector length");
2274   }
2275 }
2276 
2277 #ifdef _LP64
2278 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2279                              Register dst, Register src1, XMMRegister src2,
2280                              XMMRegister vtmp1, XMMRegister vtmp2) {
2281   switch (vlen) {
2282     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2283     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2284     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2285 
2286     default: assert(false, "wrong vector length");
2287   }
2288 }
2289 #endif // _LP64
2290 
2291 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2292   switch (vlen) {
2293     case 2:
2294       assert(vtmp2 == xnoreg, "");
2295       reduce2F(opcode, dst, src, vtmp1);
2296       break;
2297     case 4:
2298       assert(vtmp2 == xnoreg, "");
2299       reduce4F(opcode, dst, src, vtmp1);
2300       break;
2301     case 8:
2302       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2303       break;
2304     case 16:
2305       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2306       break;
2307     default: assert(false, "wrong vector length");
2308   }
2309 }
2310 
2311 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2312   switch (vlen) {
2313     case 2:
2314       assert(vtmp2 == xnoreg, "");
2315       reduce2D(opcode, dst, src, vtmp1);
2316       break;
2317     case 4:
2318       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2319       break;
2320     case 8:
2321       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2322       break;
2323     default: assert(false, "wrong vector length");
2324   }
2325 }
2326 
2327 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2328   if (opcode == Op_AddReductionVI) {
2329     if (vtmp1 != src2) {
2330       movdqu(vtmp1, src2);
2331     }
2332     phaddd(vtmp1, vtmp1);
2333   } else {
2334     pshufd(vtmp1, src2, 0x1);
2335     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2336   }
2337   movdl(vtmp2, src1);
2338   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2339   movdl(dst, vtmp1);
2340 }
2341 
2342 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2343   if (opcode == Op_AddReductionVI) {
2344     if (vtmp1 != src2) {
2345       movdqu(vtmp1, src2);
2346     }
2347     phaddd(vtmp1, src2);
2348     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2349   } else {
2350     pshufd(vtmp2, src2, 0xE);
2351     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2352     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2353   }
2354 }
2355 
2356 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2357   if (opcode == Op_AddReductionVI) {
2358     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2359     vextracti128_high(vtmp2, vtmp1);
2360     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2361     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2362   } else {
2363     vextracti128_high(vtmp1, src2);
2364     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2365     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2366   }
2367 }
2368 
2369 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2370   vextracti64x4_high(vtmp2, src2);
2371   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2372   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2373 }
2374 
2375 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2376   pshufd(vtmp2, src2, 0x1);
2377   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2378   movdqu(vtmp1, vtmp2);
2379   psrldq(vtmp1, 2);
2380   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2381   movdqu(vtmp2, vtmp1);
2382   psrldq(vtmp2, 1);
2383   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2384   movdl(vtmp2, src1);
2385   pmovsxbd(vtmp1, vtmp1);
2386   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2387   pextrb(dst, vtmp1, 0x0);
2388   movsbl(dst, dst);
2389 }
2390 
2391 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2392   pshufd(vtmp1, src2, 0xE);
2393   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2394   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2395 }
2396 
2397 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2398   vextracti128_high(vtmp2, src2);
2399   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2400   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2401 }
2402 
2403 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2404   vextracti64x4_high(vtmp1, src2);
2405   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2406   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2407 }
2408 
2409 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2410   pmovsxbw(vtmp2, src2);
2411   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2412 }
2413 
2414 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2415   if (UseAVX > 1) {
2416     int vector_len = Assembler::AVX_256bit;
2417     vpmovsxbw(vtmp1, src2, vector_len);
2418     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2419   } else {
2420     pmovsxbw(vtmp2, src2);
2421     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2422     pshufd(vtmp2, src2, 0x1);
2423     pmovsxbw(vtmp2, src2);
2424     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2425   }
2426 }
2427 
2428 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2429   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2430     int vector_len = Assembler::AVX_512bit;
2431     vpmovsxbw(vtmp1, src2, vector_len);
2432     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2433   } else {
    assert(UseAVX >= 2, "Should not reach here.");
2435     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2436     vextracti128_high(vtmp2, src2);
2437     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2438   }
2439 }
2440 
2441 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2442   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2443   vextracti64x4_high(vtmp2, src2);
2444   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2445 }
2446 
2447 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2448   if (opcode == Op_AddReductionVI) {
2449     if (vtmp1 != src2) {
2450       movdqu(vtmp1, src2);
2451     }
2452     phaddw(vtmp1, vtmp1);
2453     phaddw(vtmp1, vtmp1);
2454   } else {
2455     pshufd(vtmp2, src2, 0x1);
2456     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2457     movdqu(vtmp1, vtmp2);
2458     psrldq(vtmp1, 2);
2459     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2460   }
2461   movdl(vtmp2, src1);
2462   pmovsxwd(vtmp1, vtmp1);
2463   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2464   pextrw(dst, vtmp1, 0x0);
2465   movswl(dst, dst);
2466 }
2467 
2468 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2469   if (opcode == Op_AddReductionVI) {
2470     if (vtmp1 != src2) {
2471       movdqu(vtmp1, src2);
2472     }
2473     phaddw(vtmp1, src2);
2474   } else {
2475     pshufd(vtmp1, src2, 0xE);
2476     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2477   }
2478   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2479 }
2480 
2481 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2482   if (opcode == Op_AddReductionVI) {
2483     int vector_len = Assembler::AVX_256bit;
2484     vphaddw(vtmp2, src2, src2, vector_len);
2485     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2486   } else {
2487     vextracti128_high(vtmp2, src2);
2488     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2489   }
2490   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2491 }
2492 
2493 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2494   int vector_len = Assembler::AVX_256bit;
2495   vextracti64x4_high(vtmp1, src2);
2496   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2497   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2498 }
2499 
2500 #ifdef _LP64
2501 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2502   pshufd(vtmp2, src2, 0xE);
2503   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2504   movdq(vtmp1, src1);
2505   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2506   movdq(dst, vtmp1);
2507 }
2508 
2509 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2510   vextracti128_high(vtmp1, src2);
2511   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2512   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2513 }
2514 
2515 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2516   vextracti64x4_high(vtmp2, src2);
2517   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2518   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2519 }
2520 
2521 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
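  // Build a k-mask with the low 'len' bits set: start from all-ones and let bzhi clear
  // every bit at or above position 'len' (e.g. len == 5 yields 0b11111).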
2522   mov64(temp, -1L);
2523   bzhiq(temp, temp, len);
2524   kmovql(dst, temp);
2525 }
2526 #endif // _LP64
2527 
2528 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2529   reduce_operation_128(T_FLOAT, opcode, dst, src);
2530   pshufd(vtmp, src, 0x1);
2531   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2532 }
2533 
2534 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2535   reduce2F(opcode, dst, src, vtmp);
2536   pshufd(vtmp, src, 0x2);
2537   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2538   pshufd(vtmp, src, 0x3);
2539   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2540 }
2541 
2542 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2543   reduce4F(opcode, dst, src, vtmp2);
2544   vextractf128_high(vtmp2, src);
2545   reduce4F(opcode, dst, vtmp2, vtmp1);
2546 }
2547 
2548 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2549   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2550   vextracti64x4_high(vtmp1, src);
2551   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2552 }
2553 
2554 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2555   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2556   pshufd(vtmp, src, 0xE);
2557   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2558 }
2559 
2560 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2561   reduce2D(opcode, dst, src, vtmp2);
2562   vextractf128_high(vtmp2, src);
2563   reduce2D(opcode, dst, vtmp2, vtmp1);
2564 }
2565 
2566 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2567   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2568   vextracti64x4_high(vtmp1, src);
2569   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2570 }
2571 
2572 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2573   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2574 }
2575 
2576 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2577   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2578 }
2579 
2580 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2581                                  int vec_enc) {
2582   switch(elem_bt) {
2583     case T_INT:
2584     case T_FLOAT:
2585       vmaskmovps(dst, src, mask, vec_enc);
2586       break;
2587     case T_LONG:
2588     case T_DOUBLE:
2589       vmaskmovpd(dst, src, mask, vec_enc);
2590       break;
2591     default:
2592       fatal("Unsupported type %s", type2name(elem_bt));
2593       break;
2594   }
2595 }
2596 
2597 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2598                                  int vec_enc) {
2599   switch(elem_bt) {
2600     case T_INT:
2601     case T_FLOAT:
2602       vmaskmovps(dst, src, mask, vec_enc);
2603       break;
2604     case T_LONG:
2605     case T_DOUBLE:
2606       vmaskmovpd(dst, src, mask, vec_enc);
2607       break;
2608     default:
2609       fatal("Unsupported type %s", type2name(elem_bt));
2610       break;
2611   }
2612 }
2613 
2614 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2615                                           XMMRegister dst, XMMRegister src,
2616                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2617                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2618   const int permconst[] = {1, 14};
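  // permconst holds the vpermilps immediates for the in-lane steps below:
  //   1  (0b0001) brings element 1 down to position 0 for the final pairwise step,
  //   14 (0b1110) brings elements 2 and 3 down to positions 0 and 1.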
2619   XMMRegister wsrc = src;
2620   XMMRegister wdst = xmm_0;
2621   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2622 
2623   int vlen_enc = Assembler::AVX_128bit;
2624   if (vlen == 16) {
2625     vlen_enc = Assembler::AVX_256bit;
2626   }
2627 
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2629     if (i == 0 && !is_dst_valid) {
2630       wdst = dst;
2631     }
2632     if (i == 3) {
2633       vextracti64x4_high(wtmp, wsrc);
2634     } else if (i == 2) {
2635       vextracti128_high(wtmp, wsrc);
2636     } else { // i = [0,1]
2637       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2638     }
2639     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2640     wsrc = wdst;
2641     vlen_enc = Assembler::AVX_128bit;
2642   }
2643   if (is_dst_valid) {
2644     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2645   }
2646 }
2647 
2648 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2649                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2650                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2651   XMMRegister wsrc = src;
2652   XMMRegister wdst = xmm_0;
2653   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2654   int vlen_enc = Assembler::AVX_128bit;
2655   if (vlen == 8) {
2656     vlen_enc = Assembler::AVX_256bit;
2657   }
2658   for (int i = log2(vlen) - 1; i >=0; i--) {
2659     if (i == 0 && !is_dst_valid) {
2660       wdst = dst;
2661     }
2662     if (i == 1) {
2663       vextracti128_high(wtmp, wsrc);
2664     } else if (i == 2) {
2665       vextracti64x4_high(wtmp, wsrc);
2666     } else {
2667       assert(i == 0, "%d", i);
2668       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2669     }
2670     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2671     wsrc = wdst;
2672     vlen_enc = Assembler::AVX_128bit;
2673   }
2674   if (is_dst_valid) {
2675     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2676   }
2677 }
2678 
2679 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2680   switch (bt) {
2681     case T_BYTE:  pextrb(dst, src, idx); break;
2682     case T_SHORT: pextrw(dst, src, idx); break;
2683     case T_INT:   pextrd(dst, src, idx); break;
2684     case T_LONG:  pextrq(dst, src, idx); break;
2685 
2686     default:
2687       assert(false,"Should not reach here.");
2688       break;
2689   }
2690 }
2691 
2692 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2693   int esize =  type2aelembytes(typ);
2694   int elem_per_lane = 16/esize;
2695   int lane = elemindex / elem_per_lane;
2696   int eindex = elemindex % elem_per_lane;
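  // For example, with T_INT (esize == 4) and elemindex == 9: elem_per_lane == 4,
  // lane == 2 and eindex == 1, so 128-bit lane 2 is extracted into dst and the
  // caller picks element 1 out of it.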
2697 
2698   if (lane >= 2) {
2699     assert(UseAVX > 2, "required");
2700     vextractf32x4(dst, src, lane & 3);
2701     return dst;
2702   } else if (lane > 0) {
2703     assert(UseAVX > 0, "required");
2704     vextractf128(dst, src, lane);
2705     return dst;
2706   } else {
2707     return src;
2708   }
2709 }
2710 
2711 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2712   if (typ == T_BYTE) {
2713     movsbl(dst, dst);
2714   } else if (typ == T_SHORT) {
2715     movswl(dst, dst);
2716   }
2717 }
2718 
2719 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2720   int esize =  type2aelembytes(typ);
2721   int elem_per_lane = 16/esize;
2722   int eindex = elemindex % elem_per_lane;
2723   assert(is_integral_type(typ),"required");
2724 
2725   if (eindex == 0) {
2726     if (typ == T_LONG) {
2727       movq(dst, src);
2728     } else {
2729       movdl(dst, src);
2730       movsxl(typ, dst);
2731     }
2732   } else {
2733     extract(typ, dst, src, eindex);
2734     movsxl(typ, dst);
2735   }
2736 }
2737 
2738 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2739   int esize =  type2aelembytes(typ);
2740   int elem_per_lane = 16/esize;
2741   int eindex = elemindex % elem_per_lane;
2742   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2743 
2744   if (eindex == 0) {
2745     movq(dst, src);
2746   } else {
2747     if (typ == T_FLOAT) {
2748       if (UseAVX == 0) {
2749         movdqu(dst, src);
2750         shufps(dst, dst, eindex);
2751       } else {
2752         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2753       }
2754     } else {
2755       if (UseAVX == 0) {
2756         movdqu(dst, src);
2757         psrldq(dst, eindex*esize);
2758       } else {
2759         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2760       }
2761       movq(dst, dst);
2762     }
2763   }
2764   // Zero upper bits
2765   if (typ == T_FLOAT) {
2766     if (UseAVX == 0) {
2767       assert(vtmp != xnoreg, "required.");
2768       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2769       pand(dst, vtmp);
2770     } else {
2771       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2772     }
2773   }
2774 }
2775 
2776 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2777   switch(typ) {
2778     case T_BYTE:
2779     case T_BOOLEAN:
2780       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2781       break;
2782     case T_SHORT:
2783     case T_CHAR:
2784       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2785       break;
2786     case T_INT:
2787     case T_FLOAT:
2788       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2789       break;
2790     case T_LONG:
2791     case T_DOUBLE:
2792       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2793       break;
2794     default:
2795       assert(false,"Should not reach here.");
2796       break;
2797   }
2798 }
2799 
2800 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2801   assert(rscratch != noreg || always_reachable(src2), "missing");
2802 
2803   switch(typ) {
2804     case T_BOOLEAN:
2805     case T_BYTE:
2806       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2807       break;
2808     case T_CHAR:
2809     case T_SHORT:
2810       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2811       break;
2812     case T_INT:
2813     case T_FLOAT:
2814       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2815       break;
2816     case T_LONG:
2817     case T_DOUBLE:
2818       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2819       break;
2820     default:
2821       assert(false,"Should not reach here.");
2822       break;
2823   }
2824 }
2825 
2826 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2827   switch(typ) {
2828     case T_BYTE:
2829       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2830       break;
2831     case T_SHORT:
2832       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2833       break;
2834     case T_INT:
2835     case T_FLOAT:
2836       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2837       break;
2838     case T_LONG:
2839     case T_DOUBLE:
2840       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2841       break;
2842     default:
2843       assert(false,"Should not reach here.");
2844       break;
2845   }
2846 }
2847 
2848 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2849   assert(vlen_in_bytes <= 32, "");
2850   int esize = type2aelembytes(bt);
2851   if (vlen_in_bytes == 32) {
2852     assert(vtmp == xnoreg, "required.");
2853     if (esize >= 4) {
2854       vtestps(src1, src2, AVX_256bit);
2855     } else {
2856       vptest(src1, src2, AVX_256bit);
2857     }
2858     return;
2859   }
2860   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // no need to do so for src2.
2863     assert(vtmp != xnoreg, "required");
2864     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
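    // pshufd 0x00 broadcasts dword 0 (4-byte payload); 0x04 keeps dwords 0,1 in
    // place and fills the upper dwords with dword 0 (8-byte payload).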
2865     pshufd(vtmp, src1, shuffle_imm);
2866   } else {
2867     assert(vtmp == xnoreg, "required");
2868     vtmp = src1;
2869   }
2870   if (esize >= 4 && VM_Version::supports_avx()) {
2871     vtestps(vtmp, src2, AVX_128bit);
2872   } else {
2873     ptest(vtmp, src2);
2874   }
2875 }
2876 
2877 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2878   assert(UseAVX >= 2, "required");
2879 #ifdef ASSERT
2880   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2881   bool is_bw_supported = VM_Version::supports_avx512bw();
2882   if (is_bw && !is_bw_supported) {
2883     assert(vlen_enc != Assembler::AVX_512bit, "required");
2884     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2885            "XMM register should be 0-15");
2886   }
2887 #endif // ASSERT
2888   switch (elem_bt) {
2889     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2890     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2891     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2892     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2893     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2894     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2895     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2896   }
2897 }
2898 
2899 #ifdef _LP64
2900 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2901   assert(UseAVX >= 2, "required");
2902   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2903   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2904   if ((UseAVX > 2) &&
2905       (!is_bw || VM_Version::supports_avx512bw()) &&
2906       (!is_vl || VM_Version::supports_avx512vl())) {
2907     switch (elem_bt) {
2908       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2909       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2910       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2911       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2912       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2913     }
2914   } else {
2915     assert(vlen_enc != Assembler::AVX_512bit, "required");
2916     assert((dst->encoding() < 16),"XMM register should be 0-15");
2917     switch (elem_bt) {
2918       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2919       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2920       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2921       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2922       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2923       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2924       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2925     }
2926   }
2927 }
2928 #endif
2929 
2930 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2931   switch (to_elem_bt) {
2932     case T_SHORT:
2933       vpmovsxbw(dst, src, vlen_enc);
2934       break;
2935     case T_INT:
2936       vpmovsxbd(dst, src, vlen_enc);
2937       break;
2938     case T_FLOAT:
2939       vpmovsxbd(dst, src, vlen_enc);
2940       vcvtdq2ps(dst, dst, vlen_enc);
2941       break;
2942     case T_LONG:
2943       vpmovsxbq(dst, src, vlen_enc);
2944       break;
2945     case T_DOUBLE: {
2946       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
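      // byte->double widens 8x, so the intermediate int vector needs only half
      // the width of the final double vector (e.g. 8 doubles in 512 bits come
      // from 8 ints in 256 bits).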
2947       vpmovsxbd(dst, src, mid_vlen_enc);
2948       vcvtdq2pd(dst, dst, vlen_enc);
2949       break;
2950     }
2951     default:
2952       fatal("Unsupported type %s", type2name(to_elem_bt));
2953       break;
2954   }
2955 }
2956 
2957 //-------------------------------------------------------------------------------------------
2958 
// IndexOf for constant substrings with size >= 8 chars
// which do not need to be loaded through the stack.
2961 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2962                                          Register cnt1, Register cnt2,
2963                                          int int_cnt2,  Register result,
2964                                          XMMRegister vec, Register tmp,
2965                                          int ae) {
2966   ShortBranchVerifier sbv(this);
2967   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2968   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2969 
2970   // This method uses the pcmpestri instruction with bound registers
2971   //   inputs:
2972   //     xmm - substring
2973   //     rax - substring length (elements count)
2974   //     mem - scanned string
2975   //     rdx - string length (elements count)
2976   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2977   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2978   //   outputs:
2979   //     rcx - matched index in string
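  //   flags:
  //     CF - set when a match candidate is found (branch "below")
  //     OF - set when the match starts at element 0 (branch "overflow")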
2980   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2981   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2982   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2983   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2984   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2985 
2986   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2987         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2988         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2989 
2990   // Note, inline_string_indexOf() generates checks:
2991   // if (substr.count > string.count) return -1;
2992   // if (substr.count == 0) return 0;
2993   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2994 
2995   // Load substring.
2996   if (ae == StrIntrinsicNode::UL) {
2997     pmovzxbw(vec, Address(str2, 0));
2998   } else {
2999     movdqu(vec, Address(str2, 0));
3000   }
3001   movl(cnt2, int_cnt2);
3002   movptr(result, str1); // string addr
3003 
3004   if (int_cnt2 > stride) {
3005     jmpb(SCAN_TO_SUBSTR);
3006 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3009     bind(RELOAD_SUBSTR);
3010     if (ae == StrIntrinsicNode::UL) {
3011       pmovzxbw(vec, Address(str2, 0));
3012     } else {
3013       movdqu(vec, Address(str2, 0));
3014     }
3015     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
3016 
3017     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3021 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
3024     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
3025     subl(cnt1, cnt2);
3026     addl(cnt1, int_cnt2);
3027     movl(cnt2, int_cnt2); // Now restore cnt2
3028 
3029     decrementl(cnt1);     // Shift to next element
3030     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3032 
3033     addptr(result, (1<<scale1));
3034 
3035   } // (int_cnt2 > 8)
3036 
3037   // Scan string for start of substr in 16-byte vectors
3038   bind(SCAN_TO_SUBSTR);
3039   pcmpestri(vec, Address(result, 0), mode);
3040   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3041   subl(cnt1, stride);
3042   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3043   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3045   addptr(result, 16);
3046   jmpb(SCAN_TO_SUBSTR);
3047 
3048   // Found a potential substr
3049   bind(FOUND_CANDIDATE);
3050   // Matched whole vector if first element matched (tmp(rcx) == 0).
3051   if (int_cnt2 == stride) {
3052     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
3053   } else { // int_cnt2 > 8
3054     jccb(Assembler::overflow, FOUND_SUBSTR);
3055   }
3056   // After pcmpestri tmp(rcx) contains matched element index
3057   // Compute start addr of substr
3058   lea(result, Address(result, tmp, scale1));
3059 
3060   // Make sure string is still long enough
3061   subl(cnt1, tmp);
3062   cmpl(cnt1, cnt2);
3063   if (int_cnt2 == stride) {
3064     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3065   } else { // int_cnt2 > 8
3066     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
3067   }
  // Left less than substring.
3069 
3070   bind(RET_NOT_FOUND);
3071   movl(result, -1);
3072   jmp(EXIT);
3073 
3074   if (int_cnt2 > stride) {
3075     // This code is optimized for the case when whole substring
3076     // is matched if its head is matched.
3077     bind(MATCH_SUBSTR_HEAD);
3078     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
3080     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
3081 
3082     Label CONT_SCAN_SUBSTR;
3083     // Compare the rest of substring (> 8 chars).
3084     bind(FOUND_SUBSTR);
3085     // First 8 chars are already matched.
3086     negptr(cnt2);
3087     addptr(cnt2, stride);
3088 
3089     bind(SCAN_SUBSTR);
3090     subl(cnt1, stride);
3091     cmpl(cnt2, -stride); // Do not read beyond substring
3092     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
3093     // Back-up strings to avoid reading beyond substring:
3094     // cnt1 = cnt1 - cnt2 + 8
3095     addl(cnt1, cnt2); // cnt2 is negative
3096     addl(cnt1, stride);
3097     movl(cnt2, stride); negptr(cnt2);
3098     bind(CONT_SCAN_SUBSTR);
3099     if (int_cnt2 < (int)G) {
3100       int tail_off1 = int_cnt2<<scale1;
3101       int tail_off2 = int_cnt2<<scale2;
3102       if (ae == StrIntrinsicNode::UL) {
3103         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
3104       } else {
3105         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3106       }
3107       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3108     } else {
3109       // calculate index in register to avoid integer overflow (int_cnt2*2)
3110       movl(tmp, int_cnt2);
3111       addptr(tmp, cnt2);
3112       if (ae == StrIntrinsicNode::UL) {
3113         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3114       } else {
3115         movdqu(vec, Address(str2, tmp, scale2, 0));
3116       }
3117       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3118     }
    // Need to reload string pointers if the whole vector did not match
3120     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3121     addptr(cnt2, stride);
3122     jcc(Assembler::negative, SCAN_SUBSTR);
3123     // Fall through if found full substring
3124 
3125   } // (int_cnt2 > 8)
3126 
3127   bind(RET_FOUND);
3128   // Found result if we matched full small substring.
3129   // Compute substr offset
3130   subptr(result, str1);
3131   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3132     shrl(result, 1); // index
3133   }
3134   bind(EXIT);
3135 
3136 } // string_indexofC8
3137 
// Small strings are loaded through the stack if they cross a page boundary.
3139 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3140                                        Register cnt1, Register cnt2,
3141                                        int int_cnt2,  Register result,
3142                                        XMMRegister vec, Register tmp,
3143                                        int ae) {
3144   ShortBranchVerifier sbv(this);
3145   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3146   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3147 
3148   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
3152   //
3153   // Note, inline_string_indexOf() generates checks:
3154   // if (substr.count > string.count) return -1;
3155   // if (substr.count == 0) return 0;
3156   //
3157   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3158   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3159   // This method uses the pcmpestri instruction with bound registers
3160   //   inputs:
3161   //     xmm - substring
3162   //     rax - substring length (elements count)
3163   //     mem - scanned string
3164   //     rdx - string length (elements count)
3165   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3166   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3167   //   outputs:
3168   //     rcx - matched index in string
3169   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3170   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3171   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3172   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3173 
3174   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3175         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3176         FOUND_CANDIDATE;
3177 
3178   { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
3181     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3182 
3183     movptr(tmp, rsp); // save old SP
3184 
3185     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3186       if (int_cnt2 == (1>>scale2)) { // One byte
3187         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3188         load_unsigned_byte(result, Address(str2, 0));
3189         movdl(vec, result); // move 32 bits
3190       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3191         // Not enough header space in 32-bit VM: 12+3 = 15.
3192         movl(result, Address(str2, -1));
3193         shrl(result, 8);
3194         movdl(vec, result); // move 32 bits
3195       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3196         load_unsigned_short(result, Address(str2, 0));
3197         movdl(vec, result); // move 32 bits
3198       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3199         movdl(vec, Address(str2, 0)); // move 32 bits
3200       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3201         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3203         // Array header size is 12 bytes in 32-bit VM
3204         // + 6 bytes for 3 chars == 18 bytes,
3205         // enough space to load vec and shift.
3206         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3207         if (ae == StrIntrinsicNode::UL) {
3208           int tail_off = int_cnt2-8;
3209           pmovzxbw(vec, Address(str2, tail_off));
3210           psrldq(vec, -2*tail_off);
3211         }
3212         else {
3213           int tail_off = int_cnt2*(1<<scale2);
3214           movdqu(vec, Address(str2, tail_off-16));
3215           psrldq(vec, 16-tail_off);
3216         }
3217       }
3218     } else { // not constant substring
3219       cmpl(cnt2, stride);
3220       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3221 
      // We can read beyond the string if str+16 does not cross a page boundary
      // since heaps are aligned and mapped by pages.
3224       assert(os::vm_page_size() < (int)G, "default page should be small");
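      // E.g. with a 4K page the check below passes only when the low 12 bits of
      // str2 are <= 0xFF0, so a 16-byte load cannot run into the next page.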
3225       movl(result, str2); // We need only low 32 bits
3226       andl(result, ((int)os::vm_page_size()-1));
3227       cmpl(result, ((int)os::vm_page_size()-16));
3228       jccb(Assembler::belowEqual, CHECK_STR);
3229 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3231       subptr(rsp, 16);
3232       int stk_offset = wordSize-(1<<scale2);
3233       push(cnt2);
3234 
3235       bind(COPY_SUBSTR);
3236       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3237         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3238         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3239       } else if (ae == StrIntrinsicNode::UU) {
3240         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3241         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3242       }
3243       decrement(cnt2);
3244       jccb(Assembler::notZero, COPY_SUBSTR);
3245 
3246       pop(cnt2);
3247       movptr(str2, rsp);  // New substring address
3248     } // non constant
3249 
3250     bind(CHECK_STR);
3251     cmpl(cnt1, stride);
3252     jccb(Assembler::aboveEqual, BIG_STRINGS);
3253 
3254     // Check cross page boundary.
3255     movl(result, str1); // We need only low 32 bits
3256     andl(result, ((int)os::vm_page_size()-1));
3257     cmpl(result, ((int)os::vm_page_size()-16));
3258     jccb(Assembler::belowEqual, BIG_STRINGS);
3259 
3260     subptr(rsp, 16);
3261     int stk_offset = -(1<<scale1);
3262     if (int_cnt2 < 0) { // not constant
3263       push(cnt2);
3264       stk_offset += wordSize;
3265     }
3266     movl(cnt2, cnt1);
3267 
3268     bind(COPY_STR);
3269     if (ae == StrIntrinsicNode::LL) {
3270       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3271       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3272     } else {
3273       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3274       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3275     }
3276     decrement(cnt2);
3277     jccb(Assembler::notZero, COPY_STR);
3278 
3279     if (int_cnt2 < 0) { // not constant
3280       pop(cnt2);
3281     }
3282     movptr(str1, rsp);  // New string address
3283 
3284     bind(BIG_STRINGS);
3285     // Load substring.
3286     if (int_cnt2 < 0) { // -1
3287       if (ae == StrIntrinsicNode::UL) {
3288         pmovzxbw(vec, Address(str2, 0));
3289       } else {
3290         movdqu(vec, Address(str2, 0));
3291       }
3292       push(cnt2);       // substr count
3293       push(str2);       // substr addr
3294       push(str1);       // string addr
3295     } else {
3296       // Small (< 8 chars) constant substrings are loaded already.
3297       movl(cnt2, int_cnt2);
3298     }
3299     push(tmp);  // original SP
3300 
3301   } // Finished loading
3302 
3303   //========================================================
3304   // Start search
3305   //
3306 
3307   movptr(result, str1); // string addr
3308 
  if (int_cnt2 < 0) {  // Only for a non-constant substring
3310     jmpb(SCAN_TO_SUBSTR);
3311 
3312     // SP saved at sp+0
3313     // String saved at sp+1*wordSize
3314     // Substr saved at sp+2*wordSize
3315     // Substr count saved at sp+3*wordSize
3316 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3319     bind(RELOAD_SUBSTR);
3320     movptr(str2, Address(rsp, 2*wordSize));
3321     movl(cnt2, Address(rsp, 3*wordSize));
3322     if (ae == StrIntrinsicNode::UL) {
3323       pmovzxbw(vec, Address(str2, 0));
3324     } else {
3325       movdqu(vec, Address(str2, 0));
3326     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3330     subptr(str1, result); // Restore counter
3331     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3332       shrl(str1, 1);
3333     }
3334     addl(cnt1, str1);
3335     decrementl(cnt1);   // Shift to next element
3336     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3338 
3339     addptr(result, (1<<scale1));
3340   } // non constant
3341 
3342   // Scan string for start of substr in 16-byte vectors
3343   bind(SCAN_TO_SUBSTR);
3344   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3345   pcmpestri(vec, Address(result, 0), mode);
3346   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3347   subl(cnt1, stride);
3348   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3349   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3351   addptr(result, 16);
3352 
3353   bind(ADJUST_STR);
3354   cmpl(cnt1, stride); // Do not read beyond string
3355   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3356   // Back-up string to avoid reading beyond string.
3357   lea(result, Address(result, cnt1, scale1, -16));
3358   movl(cnt1, stride);
3359   jmpb(SCAN_TO_SUBSTR);
3360 
3361   // Found a potential substr
3362   bind(FOUND_CANDIDATE);
3363   // After pcmpestri tmp(rcx) contains matched element index
3364 
3365   // Make sure string is still long enough
3366   subl(cnt1, tmp);
3367   cmpl(cnt1, cnt2);
3368   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3370 
3371   bind(RET_NOT_FOUND);
3372   movl(result, -1);
3373   jmp(CLEANUP);
3374 
3375   bind(FOUND_SUBSTR);
3376   // Compute start addr of substr
3377   lea(result, Address(result, tmp, scale1));
3378   if (int_cnt2 > 0) { // Constant substring
3379     // Repeat search for small substring (< 8 chars)
3380     // from new point without reloading substring.
3381     // Have to check that we don't read beyond string.
3382     cmpl(tmp, stride-int_cnt2);
3383     jccb(Assembler::greater, ADJUST_STR);
3384     // Fall through if matched whole substring.
3385   } else { // non constant
3386     assert(int_cnt2 == -1, "should be != 0");
3387 
3388     addl(tmp, cnt2);
3389     // Found result if we matched whole substring.
3390     cmpl(tmp, stride);
3391     jcc(Assembler::lessEqual, RET_FOUND);
3392 
3393     // Repeat search for small substring (<= 8 chars)
3394     // from new point 'str1' without reloading substring.
3395     cmpl(cnt2, stride);
3396     // Have to check that we don't read beyond string.
3397     jccb(Assembler::lessEqual, ADJUST_STR);
3398 
3399     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3400     // Compare the rest of substring (> 8 chars).
3401     movptr(str1, result);
3402 
3403     cmpl(tmp, cnt2);
3404     // First 8 chars are already matched.
3405     jccb(Assembler::equal, CHECK_NEXT);
3406 
3407     bind(SCAN_SUBSTR);
3408     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if the whole vector did not match
3410     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3411 
3412     bind(CHECK_NEXT);
3413     subl(cnt2, stride);
3414     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3415     addptr(str1, 16);
3416     if (ae == StrIntrinsicNode::UL) {
3417       addptr(str2, 8);
3418     } else {
3419       addptr(str2, 16);
3420     }
3421     subl(cnt1, stride);
3422     cmpl(cnt2, stride); // Do not read beyond substring
3423     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3424     // Back-up strings to avoid reading beyond substring.
3425 
3426     if (ae == StrIntrinsicNode::UL) {
3427       lea(str2, Address(str2, cnt2, scale2, -8));
3428       lea(str1, Address(str1, cnt2, scale1, -16));
3429     } else {
3430       lea(str2, Address(str2, cnt2, scale2, -16));
3431       lea(str1, Address(str1, cnt2, scale1, -16));
3432     }
3433     subl(cnt1, cnt2);
3434     movl(cnt2, stride);
3435     addl(cnt1, stride);
3436     bind(CONT_SCAN_SUBSTR);
3437     if (ae == StrIntrinsicNode::UL) {
3438       pmovzxbw(vec, Address(str2, 0));
3439     } else {
3440       movdqu(vec, Address(str2, 0));
3441     }
3442     jmp(SCAN_SUBSTR);
3443 
3444     bind(RET_FOUND_LONG);
3445     movptr(str1, Address(rsp, wordSize));
3446   } // non constant
3447 
3448   bind(RET_FOUND);
3449   // Compute substr offset
3450   subptr(result, str1);
3451   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3452     shrl(result, 1); // index
3453   }
3454   bind(CLEANUP);
3455   pop(rsp); // restore SP
3456 
3457 } // string_indexof
3458 
3459 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3460                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3461   ShortBranchVerifier sbv(this);
3462   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3463 
3464   int stride = 8;
3465 
3466   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3467         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3468         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3469         FOUND_SEQ_CHAR, DONE_LABEL;
3470 
3471   movptr(result, str1);
3472   if (UseAVX >= 2) {
3473     cmpl(cnt1, stride);
3474     jcc(Assembler::less, SCAN_TO_CHAR);
3475     cmpl(cnt1, 2*stride);
3476     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3477     movdl(vec1, ch);
3478     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3479     vpxor(vec2, vec2);
3480     movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
    andl(cnt1, 0x0000000F); // tail count (in chars)
3483 
3484     bind(SCAN_TO_16_CHAR_LOOP);
3485     vmovdqu(vec3, Address(result, 0));
3486     vpcmpeqw(vec3, vec3, vec1, 1);
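    // vec2 is all zeros, so vptest sets CF iff vec3 (the comparison result) is
    // all zeros; carryClear therefore means at least one position matched.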
3487     vptest(vec2, vec3);
3488     jcc(Assembler::carryClear, FOUND_CHAR);
3489     addptr(result, 32);
3490     subl(tmp, 2*stride);
3491     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3492     jmp(SCAN_TO_8_CHAR);
3493     bind(SCAN_TO_8_CHAR_INIT);
3494     movdl(vec1, ch);
3495     pshuflw(vec1, vec1, 0x00);
3496     pshufd(vec1, vec1, 0);
3497     pxor(vec2, vec2);
3498   }
3499   bind(SCAN_TO_8_CHAR);
3500   cmpl(cnt1, stride);
3501   jcc(Assembler::less, SCAN_TO_CHAR);
3502   if (UseAVX < 2) {
3503     movdl(vec1, ch);
3504     pshuflw(vec1, vec1, 0x00);
3505     pshufd(vec1, vec1, 0);
3506     pxor(vec2, vec2);
3507   }
3508   movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
  andl(cnt1, 0x00000007); // tail count (in chars)
3511 
3512   bind(SCAN_TO_8_CHAR_LOOP);
3513   movdqu(vec3, Address(result, 0));
3514   pcmpeqw(vec3, vec1);
3515   ptest(vec2, vec3);
3516   jcc(Assembler::carryClear, FOUND_CHAR);
3517   addptr(result, 16);
3518   subl(tmp, stride);
3519   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3520   bind(SCAN_TO_CHAR);
3521   testl(cnt1, cnt1);
3522   jcc(Assembler::zero, RET_NOT_FOUND);
3523   bind(SCAN_TO_CHAR_LOOP);
3524   load_unsigned_short(tmp, Address(result, 0));
3525   cmpl(ch, tmp);
3526   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3527   addptr(result, 2);
3528   subl(cnt1, 1);
3529   jccb(Assembler::zero, RET_NOT_FOUND);
3530   jmp(SCAN_TO_CHAR_LOOP);
3531 
3532   bind(RET_NOT_FOUND);
3533   movl(result, -1);
3534   jmpb(DONE_LABEL);
3535 
3536   bind(FOUND_CHAR);
3537   if (UseAVX >= 2) {
3538     vpmovmskb(tmp, vec3);
3539   } else {
3540     pmovmskb(tmp, vec3);
3541   }
3542   bsfl(ch, tmp);
3543   addptr(result, ch);
3544 
3545   bind(FOUND_SEQ_CHAR);
3546   subptr(result, str1);
3547   shrl(result, 1);
3548 
3549   bind(DONE_LABEL);
3550 } // string_indexof_char
3551 
3552 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3553                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3554   ShortBranchVerifier sbv(this);
3555   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3556 
3557   int stride = 16;
3558 
3559   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3560         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3561         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3562         FOUND_SEQ_CHAR, DONE_LABEL;
3563 
3564   movptr(result, str1);
3565   if (UseAVX >= 2) {
3566     cmpl(cnt1, stride);
3567     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3568     cmpl(cnt1, stride*2);
3569     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3570     movdl(vec1, ch);
3571     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3572     vpxor(vec2, vec2);
3573     movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  // vector count (in chars)
    andl(cnt1, 0x0000001F); // tail count (in chars)
3576 
3577     bind(SCAN_TO_32_CHAR_LOOP);
3578     vmovdqu(vec3, Address(result, 0));
3579     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3580     vptest(vec2, vec3);
3581     jcc(Assembler::carryClear, FOUND_CHAR);
3582     addptr(result, 32);
3583     subl(tmp, stride*2);
3584     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3585     jmp(SCAN_TO_16_CHAR);
3586 
3587     bind(SCAN_TO_16_CHAR_INIT);
3588     movdl(vec1, ch);
3589     pxor(vec2, vec2);
3590     pshufb(vec1, vec2);
3591   }
3592 
3593   bind(SCAN_TO_16_CHAR);
3594   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3596   if (UseAVX < 2) {
3597     movdl(vec1, ch);
3598     pxor(vec2, vec2);
3599     pshufb(vec1, vec2);
3600   }
3601   movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  // vector count (in bytes)
  andl(cnt1, 0x0000000F); // tail count (in bytes)
3604 
3605   bind(SCAN_TO_16_CHAR_LOOP);
3606   movdqu(vec3, Address(result, 0));
3607   pcmpeqb(vec3, vec1);
3608   ptest(vec2, vec3);
3609   jcc(Assembler::carryClear, FOUND_CHAR);
3610   addptr(result, 16);
3611   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3613 
3614   bind(SCAN_TO_CHAR_INIT);
3615   testl(cnt1, cnt1);
3616   jcc(Assembler::zero, RET_NOT_FOUND);
3617   bind(SCAN_TO_CHAR_LOOP);
3618   load_unsigned_byte(tmp, Address(result, 0));
3619   cmpl(ch, tmp);
3620   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3621   addptr(result, 1);
3622   subl(cnt1, 1);
3623   jccb(Assembler::zero, RET_NOT_FOUND);
3624   jmp(SCAN_TO_CHAR_LOOP);
3625 
3626   bind(RET_NOT_FOUND);
3627   movl(result, -1);
3628   jmpb(DONE_LABEL);
3629 
3630   bind(FOUND_CHAR);
3631   if (UseAVX >= 2) {
3632     vpmovmskb(tmp, vec3);
3633   } else {
3634     pmovmskb(tmp, vec3);
3635   }
3636   bsfl(ch, tmp);
3637   addptr(result, ch);
3638 
3639   bind(FOUND_SEQ_CHAR);
3640   subptr(result, str1);
3641 
3642   bind(DONE_LABEL);
3643 } // stringL_indexof_char
3644 
3645 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3646   switch (eltype) {
3647   case T_BOOLEAN: return sizeof(jboolean);
3648   case T_BYTE:  return sizeof(jbyte);
3649   case T_SHORT: return sizeof(jshort);
3650   case T_CHAR:  return sizeof(jchar);
3651   case T_INT:   return sizeof(jint);
3652   default:
3653     ShouldNotReachHere();
3654     return -1;
3655   }
3656 }
3657 
3658 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3659   switch (eltype) {
3660   // T_BOOLEAN used as surrogate for unsigned byte
3661   case T_BOOLEAN: movzbl(dst, src);   break;
3662   case T_BYTE:    movsbl(dst, src);   break;
3663   case T_SHORT:   movswl(dst, src);   break;
3664   case T_CHAR:    movzwl(dst, src);   break;
3665   case T_INT:     movl(dst, src);     break;
3666   default:
3667     ShouldNotReachHere();
3668   }
3669 }
3670 
3671 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3672   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3673 }
3674 
3675 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3676   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3677 }
3678 
3679 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3680   const int vlen = Assembler::AVX_256bit;
3681   switch (eltype) {
3682   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3683   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3684   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3685   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3686   case T_INT:
3687     // do nothing
3688     break;
3689   default:
3690     ShouldNotReachHere();
3691   }
3692 }
3693 
3694 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3695                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3696                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3697                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3698                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3699                                         BasicType eltype) {
3700   ShortBranchVerifier sbv(this);
3701   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3702   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3703   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3704 
3705   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3706         SHORT_UNROLLED_LOOP_EXIT,
3707         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3708         UNROLLED_VECTOR_LOOP_BEGIN,
3709         END;
3710   switch (eltype) {
3711   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3712   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3713   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3714   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3715   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3716   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3717   }
3718 
3719   // For "renaming" for readibility of the code
3720   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3721                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3722                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3723 
3724   const int elsize = arrays_hashcode_elsize(eltype);
3725 
3726   /*
3727     if (cnt1 >= 2) {
3728       if (cnt1 >= 32) {
3729         UNROLLED VECTOR LOOP
3730       }
3731       UNROLLED SCALAR LOOP
3732     }
3733     SINGLE SCALAR
3734    */
3735 
3736   cmpl(cnt1, 32);
3737   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3738 
3739   // cnt1 >= 32 && generate_vectorized_loop
3740   xorl(index, index);
3741 
3742   // vresult = IntVector.zero(I256);
3743   for (int idx = 0; idx < 4; idx++) {
3744     vpxor(vresult[idx], vresult[idx]);
3745   }
3746   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3747   Register bound = tmp2;
3748   Register next = tmp3;
3749   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3750   movl(next, Address(tmp2, 0));
3751   movdl(vnext, next);
3752   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3753 
3754   // index = 0;
3755   // bound = cnt1 & ~(32 - 1);
3756   movl(bound, cnt1);
3757   andl(bound, ~(32 - 1));
3758   // for (; index < bound; index += 32) {
3759   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3760   // result *= next;
3761   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching.
3764   for (int idx = 0; idx < 4; idx++) {
3765     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3766   }
3767   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3768   for (int idx = 0; idx < 4; idx++) {
3769     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3770     arrays_hashcode_elvcast(vtmp[idx], eltype);
3771     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3772   }
3773   // index += 32;
3774   addl(index, 32);
3775   // index < bound;
3776   cmpl(index, bound);
3777   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3778   // }
3779 
3780   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3781   subl(cnt1, bound);
3782   // release bound
3783 
3784   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3785   for (int idx = 0; idx < 4; idx++) {
3786     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3787     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3788     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3789   }
3790   // result += vresult.reduceLanes(ADD);
3791   for (int idx = 0; idx < 4; idx++) {
3792     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3793   }
3794 
3795   // } else if (cnt1 < 32) {
3796 
3797   bind(SHORT_UNROLLED_BEGIN);
3798   // int i = 1;
3799   movl(index, 1);
3800   cmpl(index, cnt1);
3801   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3802 
3803   // for (; i < cnt1 ; i += 2) {
3804   bind(SHORT_UNROLLED_LOOP_BEGIN);
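  // Two elements per iteration: result = result*31*31 + a[i-1]*31 + a[i];
  // 961 == 31*31, and 31*x is computed below as (x << 5) - x.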
3805   movl(tmp3, 961);
3806   imull(result, tmp3);
3807   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3808   movl(tmp3, tmp2);
3809   shll(tmp3, 5);
3810   subl(tmp3, tmp2);
3811   addl(result, tmp3);
3812   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3813   addl(result, tmp3);
3814   addl(index, 2);
3815   cmpl(index, cnt1);
3816   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3817 
3818   // }
3819   // if (i >= cnt1) {
3820   bind(SHORT_UNROLLED_LOOP_EXIT);
3821   jccb(Assembler::greater, END);
3822   movl(tmp2, result);
3823   shll(result, 5);
3824   subl(result, tmp2);
3825   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3826   addl(result, tmp3);
3827   // }
3828   bind(END);
3829 
3830   BLOCK_COMMENT("} // arrays_hashcode");
3831 
3832 } // arrays_hashcode
3833 
3834 // helper function for string_compare
3835 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3836                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3837                                            Address::ScaleFactor scale2, Register index, int ae) {
3838   if (ae == StrIntrinsicNode::LL) {
3839     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3840     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3841   } else if (ae == StrIntrinsicNode::UU) {
3842     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3843     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3844   } else {
3845     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3846     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3847   }
3848 }
3849 
3850 // Compare strings, used for char[] and byte[].
3851 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3852                                        Register cnt1, Register cnt2, Register result,
3853                                        XMMRegister vec1, int ae, KRegister mask) {
3854   ShortBranchVerifier sbv(this);
3855   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3856   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3857   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3858   int stride2x2 = 0x40;
3859   Address::ScaleFactor scale = Address::no_scale;
3860   Address::ScaleFactor scale1 = Address::no_scale;
3861   Address::ScaleFactor scale2 = Address::no_scale;
3862 
3863   if (ae != StrIntrinsicNode::LL) {
3864     stride2x2 = 0x20;
3865   }
3866 
3867   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3868     shrl(cnt2, 1);
3869   }
  // Compute the minimum of the string lengths and push the
  // difference of the string lengths onto the stack.
  // A conditional move selects the minimum into cnt2.
3873   movl(result, cnt1);
3874   subl(cnt1, cnt2);
3875   push(cnt1);
3876   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3877 
3878   // Is the minimum length zero?
3879   testl(cnt2, cnt2);
3880   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3881   if (ae == StrIntrinsicNode::LL) {
3882     // Load first bytes
3883     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3884     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3885   } else if (ae == StrIntrinsicNode::UU) {
3886     // Load first characters
3887     load_unsigned_short(result, Address(str1, 0));
3888     load_unsigned_short(cnt1, Address(str2, 0));
3889   } else {
3890     load_unsigned_byte(result, Address(str1, 0));
3891     load_unsigned_short(cnt1, Address(str2, 0));
3892   }
3893   subl(result, cnt1);
3894   jcc(Assembler::notZero,  POP_LABEL);
3895 
3896   if (ae == StrIntrinsicNode::UU) {
3897     // Divide length by 2 to get number of chars
3898     shrl(cnt2, 1);
3899   }
3900   cmpl(cnt2, 1);
3901   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3902 
  // Check if the strings start at the same location and set up scale and stride
3904   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3905     cmpptr(str1, str2);
3906     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3907     if (ae == StrIntrinsicNode::LL) {
3908       scale = Address::times_1;
3909       stride = 16;
3910     } else {
3911       scale = Address::times_2;
3912       stride = 8;
3913     }
3914   } else {
3915     scale1 = Address::times_1;
3916     scale2 = Address::times_2;
3917     // scale not used
3918     stride = 8;
3919   }
3920 
3921   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3922     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3923     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3924     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3925     Label COMPARE_TAIL_LONG;
3926     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3927 
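    // pcmpestri mode 0x19: 11000 (string compare with negated result)
    // + 01 (unsigned shorts); LL clears bit 0 to use unsigned bytes (+ 00).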
3928     int pcmpmask = 0x19;
3929     if (ae == StrIntrinsicNode::LL) {
3930       pcmpmask &= ~0x01;
3931     }
3932 
    // Set up to compare 16-char (32-byte) vectors;
    // start from the first character again because it has an aligned address.
3935     if (ae == StrIntrinsicNode::LL) {
3936       stride2 = 32;
3937     } else {
3938       stride2 = 16;
3939     }
3940     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3941       adr_stride = stride << scale;
3942     } else {
3943       adr_stride1 = 8;  //stride << scale1;
3944       adr_stride2 = 16; //stride << scale2;
3945     }
3946 
3947     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3949     movl(result, cnt2);
3950     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3951     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3952 
    // Fast path: compare the first two 8-char vectors.
3954     bind(COMPARE_16_CHARS);
3955     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3956       movdqu(vec1, Address(str1, 0));
3957     } else {
3958       pmovzxbw(vec1, Address(str1, 0));
3959     }
3960     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3961     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3962 
3963     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3964       movdqu(vec1, Address(str1, adr_stride));
3965       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3966     } else {
3967       pmovzxbw(vec1, Address(str1, adr_stride1));
3968       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3969     }
3970     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3971     addl(cnt1, stride);
3972 
3973     // Compare the characters at index in cnt1
3974     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3975     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3976     subl(result, cnt2);
3977     jmp(POP_LABEL);
3978 
3979     // Setup the registers to start vector comparison loop
3980     bind(COMPARE_WIDE_VECTORS);
3981     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3982       lea(str1, Address(str1, result, scale));
3983       lea(str2, Address(str2, result, scale));
3984     } else {
3985       lea(str1, Address(str1, result, scale1));
3986       lea(str2, Address(str2, result, scale2));
3987     }
3988     subl(result, stride2);
3989     subl(cnt2, stride2);
3990     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3991     negptr(result);
3992 
    // In a loop, compare 16 chars (32 bytes) at once using vpxor+vptest
3994     bind(COMPARE_WIDE_VECTORS_LOOP);
3995 
3996 #ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
3998       cmpl(cnt2, stride2x2);
3999       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4000       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
4001       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
4002 
4003       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4004       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4005         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
4007       } else {
4008         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
4010       }
4011       kortestql(mask, mask);
4012       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
4013       addptr(result, stride2x2);  // update since we already compared at this addr
4014       subl(cnt2, stride2x2);      // and sub the size too
4015       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4016 
4017       vpxor(vec1, vec1);
4018       jmpb(COMPARE_WIDE_TAIL);
4019     }//if (VM_Version::supports_avx512vlbw())
4020 #endif // _LP64
4021 
4022 
4023     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4024     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4025       vmovdqu(vec1, Address(str1, result, scale));
4026       vpxor(vec1, Address(str2, result, scale));
4027     } else {
4028       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
4029       vpxor(vec1, Address(str2, result, scale2));
4030     }
4031     vptest(vec1, vec1);
4032     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
4033     addptr(result, stride2);
4034     subl(cnt2, stride2);
4035     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
4036     // clean upper bits of YMM registers
4037     vpxor(vec1, vec1);
4038 
4039     // compare wide vectors tail
4040     bind(COMPARE_WIDE_TAIL);
4041     testptr(result, result);
4042     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4043 
4044     movl(result, stride2);
4045     movl(cnt2, result);
4046     negptr(result);
4047     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4048 
    // Identify the mismatching (upper or lower) 16 bytes in the 32-byte vectors.
4050     bind(VECTOR_NOT_EQUAL);
4051     // clean upper bits of YMM registers
4052     vpxor(vec1, vec1);
4053     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4054       lea(str1, Address(str1, result, scale));
4055       lea(str2, Address(str2, result, scale));
4056     } else {
4057       lea(str1, Address(str1, result, scale1));
4058       lea(str2, Address(str2, result, scale2));
4059     }
4060     jmp(COMPARE_16_CHARS);
4061 
    // Compare tail chars, length between 1 and 15 chars
4063     bind(COMPARE_TAIL_LONG);
4064     movl(cnt2, result);
4065     cmpl(cnt2, stride);
4066     jcc(Assembler::less, COMPARE_SMALL_STR);
4067 
4068     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4069       movdqu(vec1, Address(str1, 0));
4070     } else {
4071       pmovzxbw(vec1, Address(str1, 0));
4072     }
4073     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4074     jcc(Assembler::below, COMPARE_INDEX_CHAR);
4075     subptr(cnt2, stride);
4076     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4077     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4078       lea(str1, Address(str1, result, scale));
4079       lea(str2, Address(str2, result, scale));
4080     } else {
4081       lea(str1, Address(str1, result, scale1));
4082       lea(str2, Address(str2, result, scale2));
4083     }
4084     negptr(cnt2);
4085     jmpb(WHILE_HEAD_LABEL);
4086 
4087     bind(COMPARE_SMALL_STR);
4088   } else if (UseSSE42Intrinsics) {
4089     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
4090     int pcmpmask = 0x19;
4091     // Setup to compare 8-char (16-byte) vectors,
4092     // start from first character again because it has aligned address.
4093     movl(result, cnt2);
4094     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
4095     if (ae == StrIntrinsicNode::LL) {
4096       pcmpmask &= ~0x01;
4097     }
4098     jcc(Assembler::zero, COMPARE_TAIL);
4099     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4100       lea(str1, Address(str1, result, scale));
4101       lea(str2, Address(str2, result, scale));
4102     } else {
4103       lea(str1, Address(str1, result, scale1));
4104       lea(str2, Address(str2, result, scale2));
4105     }
4106     negptr(result);
4107 
4108     // pcmpestri
4109     //   inputs:
4110     //     vec1- substring
4111     //     rax - negative string length (elements count)
4112     //     mem - scanned string
4113     //     rdx - string length (elements count)
4114     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4115     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4116     //   outputs:
4117     //     rcx - first mismatched element index
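         // For reference, the immediate used above can be composed like this
         // (a sketch with illustrative local names; the constants follow the
         // SSE4.2 PCMPESTRI imm8 field layout):
         //   int fmt  = (ae == StrIntrinsicNode::LL) ? 0x00 : 0x01; // unsigned bytes vs. unsigned words
         //   int aggr = 0x02 << 2;                                  // "equal each" (string compare)
         //   int pol  = 0x01 << 4;                                  // negative polarity (negated result)
         //   int imm  = pol | aggr | fmt;                           // 0x18 or 0x19, matching pcmpmask above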
4118     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4119 
4120     bind(COMPARE_WIDE_VECTORS);
4121     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4122       movdqu(vec1, Address(str1, result, scale));
4123       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4124     } else {
4125       pmovzxbw(vec1, Address(str1, result, scale1));
4126       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4127     }
4128     // After pcmpestri cnt1(rcx) contains mismatched element index
4129 
4130     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4131     addptr(result, stride);
4132     subptr(cnt2, stride);
4133     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4134 
4135     // compare wide vectors tail
4136     testptr(result, result);
4137     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4138 
4139     movl(cnt2, stride);
4140     movl(result, stride);
4141     negptr(result);
4142     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4143       movdqu(vec1, Address(str1, result, scale));
4144       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4145     } else {
4146       pmovzxbw(vec1, Address(str1, result, scale1));
4147       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4148     }
4149     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4150 
4151     // Mismatched characters in the vectors
4152     bind(VECTOR_NOT_EQUAL);
4153     addptr(cnt1, result);
4154     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4155     subl(result, cnt2);
4156     jmpb(POP_LABEL);
4157 
4158     bind(COMPARE_TAIL); // limit is zero
4159     movl(cnt2, result);
4160     // Fallthru to tail compare
4161   }
4162   // Shift str2 and str1 to the end of the arrays, negate min
4163   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4164     lea(str1, Address(str1, cnt2, scale));
4165     lea(str2, Address(str2, cnt2, scale));
4166   } else {
4167     lea(str1, Address(str1, cnt2, scale1));
4168     lea(str2, Address(str2, cnt2, scale2));
4169   }
4170   decrementl(cnt2);  // first character was compared already
4171   negptr(cnt2);
4172 
4173   // Compare the rest of the elements
4174   bind(WHILE_HEAD_LABEL);
4175   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4176   subl(result, cnt1);
4177   jccb(Assembler::notZero, POP_LABEL);
4178   increment(cnt2);
4179   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4180 
4181   // Strings are equal up to min length.  Return the length difference.
4182   bind(LENGTH_DIFF_LABEL);
4183   pop(result);
4184   if (ae == StrIntrinsicNode::UU) {
4185     // Divide diff by 2 to get number of chars
4186     sarl(result, 1);
4187   }
4188   jmpb(DONE_LABEL);
4189 
4190 #ifdef _LP64
4191   if (VM_Version::supports_avx512vlbw()) {
4192 
4193     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4194 
4195     kmovql(cnt1, mask);
4196     notq(cnt1);
4197     bsfq(cnt2, cnt1);
4198     if (ae != StrIntrinsicNode::LL) {
4199       // Divide diff by 2 to get number of chars
4200       sarl(cnt2, 1);
4201     }
4202     addq(result, cnt2);
4203     if (ae == StrIntrinsicNode::LL) {
4204       load_unsigned_byte(cnt1, Address(str2, result));
4205       load_unsigned_byte(result, Address(str1, result));
4206     } else if (ae == StrIntrinsicNode::UU) {
4207       load_unsigned_short(cnt1, Address(str2, result, scale));
4208       load_unsigned_short(result, Address(str1, result, scale));
4209     } else {
4210       load_unsigned_short(cnt1, Address(str2, result, scale2));
4211       load_unsigned_byte(result, Address(str1, result, scale1));
4212     }
4213     subl(result, cnt1);
4214     jmpb(POP_LABEL);
4215   }//if (VM_Version::supports_avx512vlbw())
4216 #endif // _LP64
4217 
4218   // Discard the stored length difference
4219   bind(POP_LABEL);
4220   pop(cnt1);
4221 
4222   // That's it
4223   bind(DONE_LABEL);
4224   if (ae == StrIntrinsicNode::UL) {
4225     negl(result);
4226   }
4227 
4228 }
4229 
4230 // Search for a non-ASCII character (negative byte value) in a byte array and
4231 // return the index of the first such character; otherwise return the length
4232 // of the array segment searched.
4233 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4234 //   @IntrinsicCandidate
4235 //   public static int countPositives(byte[] ba, int off, int len) {
4236 //     for (int i = off; i < off + len; i++) {
4237 //       if (ba[i] < 0) {
4238 //         return i - off;
4239 //       }
4240 //     }
4241 //     return len;
4242 //   }
4243 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4244   Register result, Register tmp1,
4245   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4246   // rsi: byte array
4247   // rcx: len
4248   // rax: result
4249   ShortBranchVerifier sbv(this);
4250   assert_different_registers(ary1, len, result, tmp1);
4251   assert_different_registers(vec1, vec2);
4252   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4253 
4254   movl(result, len); // copy
4255   // len == 0
4256   testl(len, len);
4257   jcc(Assembler::zero, DONE);
4258 
4259   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4260     VM_Version::supports_avx512vlbw() &&
4261     VM_Version::supports_bmi2()) {
4262 
4263     Label test_64_loop, test_tail, BREAK_LOOP;
4264     movl(tmp1, len);
4265     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4266 
4267     andl(tmp1, 0x0000003f); // tail count (in bytes), 0x3F
4268     andl(len,  0xffffffc0); // vector count (in bytes)
4269     jccb(Assembler::zero, test_tail);
4270 
4271     lea(ary1, Address(ary1, len, Address::times_1));
4272     negptr(len);
4273 
4274     bind(test_64_loop);
4275     // Check whether our 64 elements of size byte contain negatives
4276     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4277     kortestql(mask1, mask1);
4278     jcc(Assembler::notZero, BREAK_LOOP);
4279 
4280     addptr(len, 64);
4281     jccb(Assembler::notZero, test_64_loop);
4282 
4283     bind(test_tail);
4284     // bail out when there is nothing to be done
4285     testl(tmp1, -1);
4286     jcc(Assembler::zero, DONE);
4287 
4288 
4289     // check the tail for absence of negatives
4290     // ~(~0 << len) applied up to two times (for 32-bit scenario)
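         // A minimal scalar sketch of the mask computed below (hypothetical tail
         // count of 5, not taken from the source):
         //   uint64_t m = ~(~0ULL << 5);   // == 0x1F
         // i.e. a k-register with the 5 least significant bits set, so only the
         // 5 tail bytes take part in the final compare.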
4291 #ifdef _LP64
4292     {
4293       Register tmp3_aliased = len;
4294       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4295       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4296       notq(tmp3_aliased);
4297       kmovql(mask2, tmp3_aliased);
4298     }
4299 #else
4300     Label k_init;
4301     jmp(k_init);
4302 
4303     // We cannot load 64 bits from a general purpose register on 32-bit
4304     // platforms, so we place the data needed to compose 64 ones in the
4305     // instruction stream instead.
4306     // We emit a 64-byte series of the values 0..63 which is later used as
4307     // a compare target against the tail count held in the tmp1 register.
4308     // The result is a k register with tmp1 consecutive 1s, counting from the least significant bit.
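         // Sketch of the idea with a hypothetical tail count of 5: broadcasting 5
         // into vec1 and doing a greater-than byte compare against the series 0..63
         // emitted below gives
         //   mask2[i] = (5 > i) ? 1 : 0   // ones in bit positions 0..4 only
         // which is the same ~(~0 << len) tail mask as on the 64-bit path.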
4309     address tmp = pc();
4310     emit_int64(0x0706050403020100);
4311     emit_int64(0x0F0E0D0C0B0A0908);
4312     emit_int64(0x1716151413121110);
4313     emit_int64(0x1F1E1D1C1B1A1918);
4314     emit_int64(0x2726252423222120);
4315     emit_int64(0x2F2E2D2C2B2A2928);
4316     emit_int64(0x3736353433323130);
4317     emit_int64(0x3F3E3D3C3B3A3938);
4318 
4319     bind(k_init);
4320     lea(len, InternalAddress(tmp));
4321     // create mask to test for negative byte inside a vector
4322     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4323     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4324 
4325 #endif
4326     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4327     ktestq(mask1, mask2);
4328     jcc(Assembler::zero, DONE);
4329 
4330     // do a full check for negative bytes in the tail
4331     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4332                      // ary1 already pointing to the right place
4333     jmpb(TAIL_START);
4334 
4335     bind(BREAK_LOOP);
4336     // At least one byte in the last 64 byte block was negative.
4337     // Set up to look at the last 64 bytes as if they were a tail
4338     lea(ary1, Address(ary1, len, Address::times_1));
4339     addptr(result, len);
4340     // Ignore the very last byte: if all others are positive,
4341     // it must be negative, so we can skip right to the 2+1 byte
4342     // end comparison at this point
4343     orl(result, 63);
4344     movl(len, 63);
4345     // Fallthru to tail compare
4346   } else {
4347 
4348     if (UseAVX >= 2 && UseSSE >= 2) {
4349       // With AVX2, use 32-byte vector compare
4350       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4351 
4352       // Compare 32-byte vectors
4353       testl(len, 0xffffffe0);   // vector count (in bytes)
4354       jccb(Assembler::zero, TAIL_START);
4355 
4356       andl(len, 0xffffffe0);
4357       lea(ary1, Address(ary1, len, Address::times_1));
4358       negptr(len);
4359 
4360       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4361       movdl(vec2, tmp1);
4362       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4363 
4364       bind(COMPARE_WIDE_VECTORS);
4365       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4366       vptest(vec1, vec2);
4367       jccb(Assembler::notZero, BREAK_LOOP);
4368       addptr(len, 32);
4369       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4370 
4371       testl(result, 0x0000001f);   // any bytes remaining?
4372       jcc(Assembler::zero, DONE);
4373 
4374       // Quick test using the already prepared vector mask
4375       movl(len, result);
4376       andl(len, 0x0000001f);
4377       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4378       vptest(vec1, vec2);
4379       jcc(Assembler::zero, DONE);
4380       // There are zeros, jump to the tail to determine exactly where
4381       jmpb(TAIL_START);
4382 
4383       bind(BREAK_LOOP);
4384       // At least one byte in the last 32-byte vector is negative.
4385       // Set up to look at the last 32 bytes as if they were a tail
4386       lea(ary1, Address(ary1, len, Address::times_1));
4387       addptr(result, len);
4388       // Ignore the very last byte: if all others are positive,
4389       // it must be negative, so we can skip right to the 2+1 byte
4390       // end comparison at this point
4391       orl(result, 31);
4392       movl(len, 31);
4393       // Fallthru to tail compare
4394     } else if (UseSSE42Intrinsics) {
4395       // With SSE4.2, use double quad vector compare
4396       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4397 
4398       // Compare 16-byte vectors
4399       testl(len, 0xfffffff0);   // vector count (in bytes)
4400       jcc(Assembler::zero, TAIL_START);
4401 
4402       andl(len, 0xfffffff0);
4403       lea(ary1, Address(ary1, len, Address::times_1));
4404       negptr(len);
4405 
4406       movl(tmp1, 0x80808080);
4407       movdl(vec2, tmp1);
4408       pshufd(vec2, vec2, 0);
4409 
4410       bind(COMPARE_WIDE_VECTORS);
4411       movdqu(vec1, Address(ary1, len, Address::times_1));
4412       ptest(vec1, vec2);
4413       jccb(Assembler::notZero, BREAK_LOOP);
4414       addptr(len, 16);
4415       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4416 
4417       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4418       jcc(Assembler::zero, DONE);
4419 
4420       // Quick test using the already prepared vector mask
4421       movl(len, result);
4422       andl(len, 0x0000000f);   // tail count (in bytes)
4423       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4424       ptest(vec1, vec2);
4425       jcc(Assembler::zero, DONE);
4426       jmpb(TAIL_START);
4427 
4428       bind(BREAK_LOOP);
4429       // At least one byte in the last 16-byte vector is negative.
4430       // Set up and look at the last 16 bytes as if they were a tail
4431       lea(ary1, Address(ary1, len, Address::times_1));
4432       addptr(result, len);
4433       // Ignore the very last byte: if all others are positive,
4434       // it must be negative, so we can skip right to the 2+1 byte
4435       // end comparison at this point
4436       orl(result, 15);
4437       movl(len, 15);
4438       // Fallthru to tail compare
4439     }
4440   }
4441 
4442   bind(TAIL_START);
4443   // Compare 4-byte vectors
4444   andl(len, 0xfffffffc); // vector count (in bytes)
4445   jccb(Assembler::zero, COMPARE_CHAR);
4446 
4447   lea(ary1, Address(ary1, len, Address::times_1));
4448   negptr(len);
4449 
4450   bind(COMPARE_VECTORS);
4451   movl(tmp1, Address(ary1, len, Address::times_1));
4452   andl(tmp1, 0x80808080);
4453   jccb(Assembler::notZero, TAIL_ADJUST);
4454   addptr(len, 4);
4455   jccb(Assembler::notZero, COMPARE_VECTORS);
4456 
4457   // Compare trailing char (final 2-3 bytes), if any
4458   bind(COMPARE_CHAR);
4459 
4460   testl(result, 0x2);   // tail  char
4461   jccb(Assembler::zero, COMPARE_BYTE);
4462   load_unsigned_short(tmp1, Address(ary1, 0));
4463   andl(tmp1, 0x00008080);
4464   jccb(Assembler::notZero, CHAR_ADJUST);
4465   lea(ary1, Address(ary1, 2));
4466 
4467   bind(COMPARE_BYTE);
4468   testl(result, 0x1);   // tail  byte
4469   jccb(Assembler::zero, DONE);
4470   load_unsigned_byte(tmp1, Address(ary1, 0));
4471   testl(tmp1, 0x00000080);
4472   jccb(Assembler::zero, DONE);
4473   subptr(result, 1);
4474   jmpb(DONE);
4475 
4476   bind(TAIL_ADJUST);
4477   // there are negative bits in the last 4 byte block.
4478   // Adjust result and check the next three bytes
4479   addptr(result, len);
4480   orl(result, 3);
4481   lea(ary1, Address(ary1, len, Address::times_1));
4482   jmpb(COMPARE_CHAR);
4483 
4484   bind(CHAR_ADJUST);
4485   // We are looking at a char + optional byte tail, and found that one
4486   // of the bytes in the char is negative. Adjust the result, check the
4487   // first byte and readjust if needed.
4488   andl(result, 0xfffffffc);
4489   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4490   jccb(Assembler::notZero, DONE);
4491   addptr(result, 1);
4492 
4493   // That's it
4494   bind(DONE);
4495   if (UseAVX >= 2 && UseSSE >= 2) {
4496     // clean upper bits of YMM registers
4497     vpxor(vec1, vec1);
4498     vpxor(vec2, vec2);
4499   }
4500 }
4501 
4502 // Compare char[] or byte[] arrays (or substrings) aligned to 4 bytes.
4503 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4504                                       Register limit, Register result, Register chr,
4505                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4506   ShortBranchVerifier sbv(this);
4507   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4508 
4509   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4510   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4511 
4512   if (is_array_equ) {
4513     // Check the input args
4514     cmpoop(ary1, ary2);
4515     jcc(Assembler::equal, TRUE_LABEL);
4516 
4517     // Need additional checks for arrays_equals.
4518     testptr(ary1, ary1);
4519     jcc(Assembler::zero, FALSE_LABEL);
4520     testptr(ary2, ary2);
4521     jcc(Assembler::zero, FALSE_LABEL);
4522 
4523     // Check the lengths
4524     movl(limit, Address(ary1, length_offset));
4525     cmpl(limit, Address(ary2, length_offset));
4526     jcc(Assembler::notEqual, FALSE_LABEL);
4527   }
4528 
4529   // count == 0
4530   testl(limit, limit);
4531   jcc(Assembler::zero, TRUE_LABEL);
4532 
4533   if (is_array_equ) {
4534     // Load array address
4535     lea(ary1, Address(ary1, base_offset));
4536     lea(ary2, Address(ary2, base_offset));
4537   }
4538 
4539   if (is_array_equ && is_char) {
4540     // arrays_equals when used for char[].
4541     shll(limit, 1);      // byte count != 0
4542   }
4543   movl(result, limit); // copy
4544 
4545   if (UseAVX >= 2) {
4546     // With AVX2, use 32-byte vector compare
4547     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4548 
4549     // Compare 32-byte vectors
4550     andl(result, 0x0000001f);  //   tail count (in bytes)
4551     andl(limit, 0xffffffe0);   // vector count (in bytes)
4552     jcc(Assembler::zero, COMPARE_TAIL);
4553 
4554     lea(ary1, Address(ary1, limit, Address::times_1));
4555     lea(ary2, Address(ary2, limit, Address::times_1));
4556     negptr(limit);
4557 
4558 #ifdef _LP64
4559     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
4560       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4561 
4562       cmpl(limit, -64);
4563       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4564 
4565       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4566 
4567       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4568       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4569       kortestql(mask, mask);
4570       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4571       addptr(limit, 64);  // update since we already compared at this addr
4572       cmpl(limit, -64);
4573       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4574 
4575       // At this point we may still need to compare -limit+result bytes.
4576       // We could execute the next two instructions and just continue via the non-wide path:
4577       //  cmpl(limit, 0);
4578       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4579       // But since we stopped at the addresses ary{1,2}+limit, which are
4580       // no farther than 64 bytes from the array ends ary{1,2}+result
4581       // (|limit| <= 32 and result < 32),
4582       // we can simply compare the last 64 bytes instead.
4583       //
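           // Worked example (hypothetical sizes, not from the source): for 150-byte
           // arrays the loop above compares bytes [0, 128) in two 64-byte steps and
           // leaves result == 22; after the addptr below the final evpcmpeqb
           // rechecks bytes [86, 150), overlapping bytes already known to be equal.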
4584       addptr(result, -64);   // safe, because the loop above has just compared this area
4585       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4586       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4587       kortestql(mask, mask);
4588       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4589 
4590       jmp(TRUE_LABEL);
4591 
4592       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4593 
4594     }//if (VM_Version::supports_avx512vlbw())
4595 #endif //_LP64
4596     bind(COMPARE_WIDE_VECTORS);
4597     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4598     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4599     vpxor(vec1, vec2);
4600 
4601     vptest(vec1, vec1);
4602     jcc(Assembler::notZero, FALSE_LABEL);
4603     addptr(limit, 32);
4604     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4605 
4606     testl(result, result);
4607     jcc(Assembler::zero, TRUE_LABEL);
4608 
4609     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4610     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4611     vpxor(vec1, vec2);
4612 
4613     vptest(vec1, vec1);
4614     jccb(Assembler::notZero, FALSE_LABEL);
4615     jmpb(TRUE_LABEL);
4616 
4617     bind(COMPARE_TAIL); // limit is zero
4618     movl(limit, result);
4619     // Fallthru to tail compare
4620   } else if (UseSSE42Intrinsics) {
4621     // With SSE4.2, use double quad vector compare
4622     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4623 
4624     // Compare 16-byte vectors
4625     andl(result, 0x0000000f);  //   tail count (in bytes)
4626     andl(limit, 0xfffffff0);   // vector count (in bytes)
4627     jcc(Assembler::zero, COMPARE_TAIL);
4628 
4629     lea(ary1, Address(ary1, limit, Address::times_1));
4630     lea(ary2, Address(ary2, limit, Address::times_1));
4631     negptr(limit);
4632 
4633     bind(COMPARE_WIDE_VECTORS);
4634     movdqu(vec1, Address(ary1, limit, Address::times_1));
4635     movdqu(vec2, Address(ary2, limit, Address::times_1));
4636     pxor(vec1, vec2);
4637 
4638     ptest(vec1, vec1);
4639     jcc(Assembler::notZero, FALSE_LABEL);
4640     addptr(limit, 16);
4641     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4642 
4643     testl(result, result);
4644     jcc(Assembler::zero, TRUE_LABEL);
4645 
4646     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4647     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4648     pxor(vec1, vec2);
4649 
4650     ptest(vec1, vec1);
4651     jccb(Assembler::notZero, FALSE_LABEL);
4652     jmpb(TRUE_LABEL);
4653 
4654     bind(COMPARE_TAIL); // limit is zero
4655     movl(limit, result);
4656     // Fallthru to tail compare
4657   }
4658 
4659   // Compare 4-byte vectors
4660   andl(limit, 0xfffffffc); // vector count (in bytes)
4661   jccb(Assembler::zero, COMPARE_CHAR);
4662 
4663   lea(ary1, Address(ary1, limit, Address::times_1));
4664   lea(ary2, Address(ary2, limit, Address::times_1));
4665   negptr(limit);
4666 
4667   bind(COMPARE_VECTORS);
4668   movl(chr, Address(ary1, limit, Address::times_1));
4669   cmpl(chr, Address(ary2, limit, Address::times_1));
4670   jccb(Assembler::notEqual, FALSE_LABEL);
4671   addptr(limit, 4);
4672   jcc(Assembler::notZero, COMPARE_VECTORS);
4673 
4674   // Compare trailing char (final 2 bytes), if any
4675   bind(COMPARE_CHAR);
4676   testl(result, 0x2);   // tail  char
4677   jccb(Assembler::zero, COMPARE_BYTE);
4678   load_unsigned_short(chr, Address(ary1, 0));
4679   load_unsigned_short(limit, Address(ary2, 0));
4680   cmpl(chr, limit);
4681   jccb(Assembler::notEqual, FALSE_LABEL);
4682 
4683   if (is_array_equ && is_char) {
4684     bind(COMPARE_BYTE);
4685   } else {
4686     lea(ary1, Address(ary1, 2));
4687     lea(ary2, Address(ary2, 2));
4688 
4689     bind(COMPARE_BYTE);
4690     testl(result, 0x1);   // tail  byte
4691     jccb(Assembler::zero, TRUE_LABEL);
4692     load_unsigned_byte(chr, Address(ary1, 0));
4693     load_unsigned_byte(limit, Address(ary2, 0));
4694     cmpl(chr, limit);
4695     jccb(Assembler::notEqual, FALSE_LABEL);
4696   }
4697   bind(TRUE_LABEL);
4698   movl(result, 1);   // return true
4699   jmpb(DONE);
4700 
4701   bind(FALSE_LABEL);
4702   xorl(result, result); // return false
4703 
4704   // That's it
4705   bind(DONE);
4706   if (UseAVX >= 2) {
4707     // clean upper bits of YMM registers
4708     vpxor(vec1, vec1);
4709     vpxor(vec2, vec2);
4710   }
4711 }
4712 
4713 #ifdef _LP64
4714 
4715 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4716 #define __ masm.
4717   Register dst = stub.data<0>();
4718   XMMRegister src = stub.data<1>();
4719   address target = stub.data<2>();
4720   __ bind(stub.entry());
4721   __ subptr(rsp, 8);
4722   __ movdbl(Address(rsp), src);
4723   __ call(RuntimeAddress(target));
4724   __ pop(dst);
4725   __ jmp(stub.continuation());
4726 #undef __
4727 }
4728 
4729 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4730   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4731   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4732 
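       // Note: cvttss2si/cvttsd2si produce the "integer indefinite" value
       // (0x80000000 for 32-bit results, 0x8000000000000000 for 64-bit results)
       // when the source is NaN or out of range; the compares below look for
       // that sentinel and, if found, divert to the matching fixup stub.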
4733   address slowpath_target;
4734   if (dst_bt == T_INT) {
4735     if (src_bt == T_FLOAT) {
4736       cvttss2sil(dst, src);
4737       cmpl(dst, 0x80000000);
4738       slowpath_target = StubRoutines::x86::f2i_fixup();
4739     } else {
4740       cvttsd2sil(dst, src);
4741       cmpl(dst, 0x80000000);
4742       slowpath_target = StubRoutines::x86::d2i_fixup();
4743     }
4744   } else {
4745     if (src_bt == T_FLOAT) {
4746       cvttss2siq(dst, src);
4747       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4748       slowpath_target = StubRoutines::x86::f2l_fixup();
4749     } else {
4750       cvttsd2siq(dst, src);
4751       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4752       slowpath_target = StubRoutines::x86::d2l_fixup();
4753     }
4754   }
4755 
4756   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4757   jcc(Assembler::equal, stub->entry());
4758   bind(stub->continuation());
4759 }
4760 
4761 #endif // _LP64
4762 
4763 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4764                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4765   switch(ideal_opc) {
4766     case Op_LShiftVS:
4767       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4768     case Op_LShiftVI:
4769       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4770     case Op_LShiftVL:
4771       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4772     case Op_RShiftVS:
4773       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4774     case Op_RShiftVI:
4775       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4776     case Op_RShiftVL:
4777       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4778     case Op_URShiftVS:
4779       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4780     case Op_URShiftVI:
4781       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4782     case Op_URShiftVL:
4783       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4784     case Op_RotateRightV:
4785       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4786     case Op_RotateLeftV:
4787       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4788     default:
4789       fatal("Unsupported masked operation"); break;
4790   }
4791 }
4792 
4793 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4794                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4795                                     bool is_varshift) {
4796   switch (ideal_opc) {
4797     case Op_AddVB:
4798       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4799     case Op_AddVS:
4800       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4801     case Op_AddVI:
4802       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4803     case Op_AddVL:
4804       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4805     case Op_AddVF:
4806       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4807     case Op_AddVD:
4808       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4809     case Op_SubVB:
4810       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4811     case Op_SubVS:
4812       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4813     case Op_SubVI:
4814       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4815     case Op_SubVL:
4816       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4817     case Op_SubVF:
4818       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4819     case Op_SubVD:
4820       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4821     case Op_MulVS:
4822       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4823     case Op_MulVI:
4824       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4825     case Op_MulVL:
4826       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4827     case Op_MulVF:
4828       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4829     case Op_MulVD:
4830       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4831     case Op_DivVF:
4832       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4833     case Op_DivVD:
4834       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4835     case Op_SqrtVF:
4836       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4837     case Op_SqrtVD:
4838       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4839     case Op_AbsVB:
4840       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4841     case Op_AbsVS:
4842       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4843     case Op_AbsVI:
4844       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4845     case Op_AbsVL:
4846       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4847     case Op_FmaVF:
4848       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4849     case Op_FmaVD:
4850       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4851     case Op_VectorRearrange:
4852       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4853     case Op_LShiftVS:
4854       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4855     case Op_LShiftVI:
4856       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4857     case Op_LShiftVL:
4858       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4859     case Op_RShiftVS:
4860       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4861     case Op_RShiftVI:
4862       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4863     case Op_RShiftVL:
4864       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4865     case Op_URShiftVS:
4866       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4867     case Op_URShiftVI:
4868       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4869     case Op_URShiftVL:
4870       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4871     case Op_RotateLeftV:
4872       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4873     case Op_RotateRightV:
4874       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4875     case Op_MaxV:
4876       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4877     case Op_MinV:
4878       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4879     case Op_XorV:
4880       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4881     case Op_OrV:
4882       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4883     case Op_AndV:
4884       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4885     default:
4886       fatal("Unsupported masked operation"); break;
4887   }
4888 }
4889 
4890 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4891                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4892   switch (ideal_opc) {
4893     case Op_AddVB:
4894       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4895     case Op_AddVS:
4896       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4897     case Op_AddVI:
4898       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4899     case Op_AddVL:
4900       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4901     case Op_AddVF:
4902       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4903     case Op_AddVD:
4904       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4905     case Op_SubVB:
4906       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4907     case Op_SubVS:
4908       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4909     case Op_SubVI:
4910       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4911     case Op_SubVL:
4912       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4913     case Op_SubVF:
4914       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4915     case Op_SubVD:
4916       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4917     case Op_MulVS:
4918       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4919     case Op_MulVI:
4920       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4921     case Op_MulVL:
4922       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4923     case Op_MulVF:
4924       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4925     case Op_MulVD:
4926       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4927     case Op_DivVF:
4928       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4929     case Op_DivVD:
4930       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4931     case Op_FmaVF:
4932       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4933     case Op_FmaVD:
4934       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4935     case Op_MaxV:
4936       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4937     case Op_MinV:
4938       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4939     case Op_XorV:
4940       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4941     case Op_OrV:
4942       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4943     case Op_AndV:
4944       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4945     default:
4946       fatal("Unsupported masked operation"); break;
4947   }
4948 }
4949 
4950 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4951                                   KRegister src1, KRegister src2) {
4952   BasicType etype = T_ILLEGAL;
4953   switch(mask_len) {
4954     case 2:
4955     case 4:
4956     case 8:  etype = T_BYTE; break;
4957     case 16: etype = T_SHORT; break;
4958     case 32: etype = T_INT; break;
4959     case 64: etype = T_LONG; break;
4960     default: fatal("Unsupported type"); break;
4961   }
4962   assert(etype != T_ILLEGAL, "");
4963   switch(ideal_opc) {
4964     case Op_AndVMask:
4965       kand(etype, dst, src1, src2); break;
4966     case Op_OrVMask:
4967       kor(etype, dst, src1, src2); break;
4968     case Op_XorVMask:
4969       kxor(etype, dst, src1, src2); break;
4970     default:
4971       fatal("Unsupported masked operation"); break;
4972   }
4973 }
4974 
4975 /*
4976  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4977  * If src is NaN, the result is 0.
4978  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4979  * the result is equal to the value of Integer.MIN_VALUE.
4980  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4981  * the result is equal to the value of Integer.MAX_VALUE.
4982  */
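     /*
      * A scalar sketch of the semantics handled here (illustration only, kept in
      * a comment; names follow HotSpot's jint/jfloat typedefs):
      *
      *   jint cast_f2i(jfloat f) {
      *     if (f != f)        return 0;         // NaN
      *     if (f <= min_jint) return min_jint;  // -Inf or too small
      *     if (f >= max_jint) return max_jint;  // +Inf or too large
      *     return (jint)f;                      // ordinary truncation
      *   }
      */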
4983 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4984                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4985                                                                    Register rscratch, AddressLiteral float_sign_flip,
4986                                                                    int vec_enc) {
4987   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4988   Label done;
4989   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4990   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4991   vptest(xtmp2, xtmp2, vec_enc);
4992   jccb(Assembler::equal, done);
4993 
4994   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4995   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4996 
4997   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4998   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4999   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
5000 
5001   // Recompute the mask for the remaining special values.
5002   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
5003   // Extract SRC values corresponding to TRUE mask lanes.
5004   vpand(xtmp4, xtmp2, src, vec_enc);
5005   // Flip mask bits so that the MSB of the mask lanes corresponding to positive
5006   // special values is set.
5007   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5008 
5009   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5010   bind(done);
5011 }
5012 
5013 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5014                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5015                                                                     Register rscratch, AddressLiteral float_sign_flip,
5016                                                                     int vec_enc) {
5017   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5018   Label done;
5019   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5020   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5021   kortestwl(ktmp1, ktmp1);
5022   jccb(Assembler::equal, done);
5023 
5024   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5025   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5026   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5027 
5028   kxorwl(ktmp1, ktmp1, ktmp2);
5029   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5030   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5031   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5032   bind(done);
5033 }
5034 
5035 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5036                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5037                                                                      Register rscratch, AddressLiteral double_sign_flip,
5038                                                                      int vec_enc) {
5039   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5040 
5041   Label done;
5042   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5043   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5044   kortestwl(ktmp1, ktmp1);
5045   jccb(Assembler::equal, done);
5046 
5047   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5048   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5049   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5050 
5051   kxorwl(ktmp1, ktmp1, ktmp2);
5052   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5053   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5054   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5055   bind(done);
5056 }
5057 
5058 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5059                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5060                                                                      Register rscratch, AddressLiteral float_sign_flip,
5061                                                                      int vec_enc) {
5062   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5063   Label done;
5064   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5065   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5066   kortestwl(ktmp1, ktmp1);
5067   jccb(Assembler::equal, done);
5068 
5069   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5070   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5071   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5072 
5073   kxorwl(ktmp1, ktmp1, ktmp2);
5074   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5075   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5076   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5077   bind(done);
5078 }
5079 
5080 /*
5081  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
5082  * If src is NaN, the result is 0.
5083  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5084  * the result is equal to the value of Long.MIN_VALUE.
5085  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5086  * the result is equal to the value of Long.MAX_VALUE.
5087  */
5088 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5089                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5090                                                                       Register rscratch, AddressLiteral double_sign_flip,
5091                                                                       int vec_enc) {
5092   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5093 
5094   Label done;
5095   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5096   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5097   kortestwl(ktmp1, ktmp1);
5098   jccb(Assembler::equal, done);
5099 
5100   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5101   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5102   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5103 
5104   kxorwl(ktmp1, ktmp1, ktmp2);
5105   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5106   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5107   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5108   bind(done);
5109 }
5110 
5111 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5112                                                              XMMRegister xtmp, int index, int vec_enc) {
5113    assert(vec_enc < Assembler::AVX_512bit, "");
5114    if (vec_enc == Assembler::AVX_256bit) {
5115      vextractf128_high(xtmp, src);
5116      vshufps(dst, src, xtmp, index, vec_enc);
5117    } else {
5118      vshufps(dst, src, zero, index, vec_enc);
5119    }
5120 }
5121 
5122 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5123                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5124                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5125   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5126 
5127   Label done;
5128   // Compare the destination lanes with float_sign_flip
5129   // value to get mask for all special values.
5130   movdqu(xtmp1, float_sign_flip, rscratch);
5131   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5132   ptest(xtmp2, xtmp2);
5133   jccb(Assembler::equal, done);
5134 
5135   // Flip float_sign_flip to get max integer value.
5136   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5137   pxor(xtmp1, xtmp4);
5138 
5139   // Set destination lanes corresponding to unordered source lanes to zero.
5140   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5141   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5142 
5143   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5144   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5145   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5146 
5147   // Recompute the mask for the remaining special values.
5148   pxor(xtmp2, xtmp3);
5149   // Extract mask corresponding to non-negative source lanes.
5150   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5151 
5152   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5153   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5154   pand(xtmp3, xtmp2);
5155 
5156   // Replace destination lanes holding the special value (0x80000000) with max int
5157   // if the corresponding source lane holds a positive value.
5158   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5159   bind(done);
5160 }
5161 
5162 
5163 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5164                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5165   switch(to_elem_bt) {
5166     case T_SHORT:
5167       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5168       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5169       vpackusdw(dst, dst, zero, vec_enc);
5170       if (vec_enc == Assembler::AVX_256bit) {
5171         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5172       }
5173       break;
5174     case  T_BYTE:
5175       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5176       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5177       vpackusdw(dst, dst, zero, vec_enc);
5178       if (vec_enc == Assembler::AVX_256bit) {
5179         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5180       }
5181       vpackuswb(dst, dst, zero, vec_enc);
5182       break;
5183     default: assert(false, "%s", type2name(to_elem_bt));
5184   }
5185 }
5186 
5187 /*
5188  * Algorithm for vector D2L and F2I conversions:
5189  * a) Perform the vector D2L/F2I cast.
5190  * b) Take the fast path if no result vector lane contains the value 0x80000000,
5191  *    which signifies that the source value was one of the special floating point
5192  *    values (NaN, -Inf, Inf, Max, -Min).
5193  * c) Set the destination to zero if the source is NaN.
5194  * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
5195  */
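     /*
      * Rough shape of the flow in the cast routines that follow (a sketch in
      * comment form, not emitted code; the per-lane predicates are informal):
      *
      *   vcvttps2dq(dst, src);                      // a) bad lanes become 0x80000000
      *   if (no lane of dst == 0x80000000) return;  // b) fast path
      *   zero dst lanes where src is NaN;           // c)
      *   set dst lanes to MaxInt where src is a positive special value;  // d)
      */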
5196 
5197 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5198                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5199                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5200   int to_elem_sz = type2aelembytes(to_elem_bt);
5201   assert(to_elem_sz <= 4, "");
5202   vcvttps2dq(dst, src, vec_enc);
5203   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5204   if (to_elem_sz < 4) {
5205     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5206     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5207   }
5208 }
5209 
5210 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5211                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5212                                             Register rscratch, int vec_enc) {
5213   int to_elem_sz = type2aelembytes(to_elem_bt);
5214   assert(to_elem_sz <= 4, "");
5215   vcvttps2dq(dst, src, vec_enc);
5216   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5217   switch(to_elem_bt) {
5218     case T_INT:
5219       break;
5220     case T_SHORT:
5221       evpmovdw(dst, dst, vec_enc);
5222       break;
5223     case T_BYTE:
5224       evpmovdb(dst, dst, vec_enc);
5225       break;
5226     default: assert(false, "%s", type2name(to_elem_bt));
5227   }
5228 }
5229 
5230 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5231                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5232                                             Register rscratch, int vec_enc) {
5233   evcvttps2qq(dst, src, vec_enc);
5234   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5235 }
5236 
5237 // Handling for downcasting from double to integer or sub-word types on AVX2.
5238 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5239                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5240                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5241   int to_elem_sz = type2aelembytes(to_elem_bt);
5242   assert(to_elem_sz < 8, "");
5243   vcvttpd2dq(dst, src, vec_enc);
5244   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5245                                               float_sign_flip, vec_enc);
5246   if (to_elem_sz < 4) {
5247     // xtmp4 holds all zero lanes.
5248     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5249   }
5250 }
5251 
5252 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5253                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5254                                             KRegister ktmp2, AddressLiteral sign_flip,
5255                                             Register rscratch, int vec_enc) {
5256   if (VM_Version::supports_avx512dq()) {
5257     evcvttpd2qq(dst, src, vec_enc);
5258     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5259     switch(to_elem_bt) {
5260       case T_LONG:
5261         break;
5262       case T_INT:
5263         evpmovsqd(dst, dst, vec_enc);
5264         break;
5265       case T_SHORT:
5266         evpmovsqd(dst, dst, vec_enc);
5267         evpmovdw(dst, dst, vec_enc);
5268         break;
5269       case T_BYTE:
5270         evpmovsqd(dst, dst, vec_enc);
5271         evpmovdb(dst, dst, vec_enc);
5272         break;
5273       default: assert(false, "%s", type2name(to_elem_bt));
5274     }
5275   } else {
5276     assert(type2aelembytes(to_elem_bt) <= 4, "");
5277     vcvttpd2dq(dst, src, vec_enc);
5278     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5279     switch(to_elem_bt) {
5280       case T_INT:
5281         break;
5282       case T_SHORT:
5283         evpmovdw(dst, dst, vec_enc);
5284         break;
5285       case T_BYTE:
5286         evpmovdb(dst, dst, vec_enc);
5287         break;
5288       default: assert(false, "%s", type2name(to_elem_bt));
5289     }
5290   }
5291 }
5292 
5293 #ifdef _LP64
5294 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5295                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5296                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5297   // Perform the floor(val+0.5) operation with MXCSR.RC forced to round towards -inf,
5298   // then restore the original MXCSR.RC mode afterwards.
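       // Why this works (hypothetical inputs, not from the source): with MXCSR.RC
       // set to round towards -inf, the conversion below behaves like floor(), so
       //   round( 2.3) = floor( 2.3 + 0.5) = floor( 2.8) =  2
       //   round(-2.5) = floor(-2.5 + 0.5) = floor(-2.0) = -2
       // which matches Java's Math.round() round-half-up semantics.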
5299   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5300 
5301   mov64(tmp, julong_cast(0.5L));
5302   evpbroadcastq(xtmp1, tmp, vec_enc);
5303   vaddpd(xtmp1, src , xtmp1, vec_enc);
5304   evcvtpd2qq(dst, xtmp1, vec_enc);
5305   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5306                                                 double_sign_flip, vec_enc);
5307 
5308   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5309 }
5310 
5311 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5312                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5313                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5314   // Perform the floor(val+0.5) operation with MXCSR.RC forced to round towards -inf,
5315   // then restore the original MXCSR.RC mode afterwards.
5316   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5317 
5318   movl(tmp, jint_cast(0.5));
5319   movq(xtmp1, tmp);
5320   vbroadcastss(xtmp1, xtmp1, vec_enc);
5321   vaddps(xtmp1, src , xtmp1, vec_enc);
5322   vcvtps2dq(dst, xtmp1, vec_enc);
5323   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5324                                               float_sign_flip, vec_enc);
5325 
5326   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5327 }
5328 
5329 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5330                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5331                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5332   // Perform the floor(val+0.5) operation with MXCSR.RC forced to round towards -inf,
5333   // then restore the original MXCSR.RC mode afterwards.
5334   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5335 
5336   movl(tmp, jint_cast(0.5));
5337   movq(xtmp1, tmp);
5338   vbroadcastss(xtmp1, xtmp1, vec_enc);
5339   vaddps(xtmp1, src , xtmp1, vec_enc);
5340   vcvtps2dq(dst, xtmp1, vec_enc);
5341   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5342 
5343   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5344 }
5345 #endif // _LP64
5346 
5347 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5348                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5349   switch (from_elem_bt) {
5350     case T_BYTE:
5351       switch (to_elem_bt) {
5352         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5353         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5354         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5355         default: ShouldNotReachHere();
5356       }
5357       break;
5358     case T_SHORT:
5359       switch (to_elem_bt) {
5360         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5361         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5362         default: ShouldNotReachHere();
5363       }
5364       break;
5365     case T_INT:
5366       assert(to_elem_bt == T_LONG, "");
5367       vpmovzxdq(dst, src, vlen_enc);
5368       break;
5369     default:
5370       ShouldNotReachHere();
5371   }
5372 }
5373 
5374 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5375                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5376   switch (from_elem_bt) {
5377     case T_BYTE:
5378       switch (to_elem_bt) {
5379         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5380         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5381         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5382         default: ShouldNotReachHere();
5383       }
5384       break;
5385     case T_SHORT:
5386       switch (to_elem_bt) {
5387         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5388         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5389         default: ShouldNotReachHere();
5390       }
5391       break;
5392     case T_INT:
5393       assert(to_elem_bt == T_LONG, "");
5394       vpmovsxdq(dst, src, vlen_enc);
5395       break;
5396     default:
5397       ShouldNotReachHere();
5398   }
5399 }
5400 
5401 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5402                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5403   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5404   assert(vlen_enc != AVX_512bit, "");
5405 
5406   int dst_bt_size = type2aelembytes(dst_bt);
5407   int src_bt_size = type2aelembytes(src_bt);
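  // Note (assumes the canonical 0/-1 mask lane encoding): sign extension widens -1
  // to -1, and the signed saturating packs below narrow -1 back to -1, so both the
  // widening and the narrowing paths preserve valid mask lane values.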
5408   if (dst_bt_size > src_bt_size) {
5409     switch (dst_bt_size / src_bt_size) {
5410       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5411       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5412       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5413       default: ShouldNotReachHere();
5414     }
5415   } else {
5416     assert(dst_bt_size < src_bt_size, "");
5417     switch (src_bt_size / dst_bt_size) {
5418       case 2: {
5419         if (vlen_enc == AVX_128bit) {
5420           vpacksswb(dst, src, src, vlen_enc);
5421         } else {
5422           vpacksswb(dst, src, src, vlen_enc);
5423           vpermq(dst, dst, 0x08, vlen_enc);
5424         }
5425         break;
5426       }
5427       case 4: {
5428         if (vlen_enc == AVX_128bit) {
5429           vpackssdw(dst, src, src, vlen_enc);
5430           vpacksswb(dst, dst, dst, vlen_enc);
5431         } else {
5432           vpackssdw(dst, src, src, vlen_enc);
5433           vpermq(dst, dst, 0x08, vlen_enc);
5434           vpacksswb(dst, dst, dst, AVX_128bit);
5435         }
5436         break;
5437       }
5438       case 8: {
5439         if (vlen_enc == AVX_128bit) {
5440           vpshufd(dst, src, 0x08, vlen_enc);
5441           vpackssdw(dst, dst, dst, vlen_enc);
5442           vpacksswb(dst, dst, dst, vlen_enc);
5443         } else {
5444           vpshufd(dst, src, 0x08, vlen_enc);
5445           vpermq(dst, dst, 0x08, vlen_enc);
5446           vpackssdw(dst, dst, dst, AVX_128bit);
5447           vpacksswb(dst, dst, dst, AVX_128bit);
5448         }
5449         break;
5450       }
5451       default: ShouldNotReachHere();
5452     }
5453   }
5454 }
5455 
5456 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5457                                    bool merge, BasicType bt, int vlen_enc) {
5458   if (bt == T_INT) {
5459     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5460   } else {
5461     assert(bt == T_LONG, "");
5462     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5463   }
5464 }
5465 
5466 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5467                                    bool merge, BasicType bt, int vlen_enc) {
5468   if (bt == T_INT) {
5469     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5470   } else {
5471     assert(bt == T_LONG, "");
5472     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5473   }
5474 }
5475 
5476 #ifdef _LP64
5477 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5478                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5479                                                int vec_enc) {
5480   int index = 0;
5481   int vindex = 0;
5482   mov64(rtmp1, 0x0101010101010101L);
5483   pdepq(rtmp1, src, rtmp1);
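  // pdepq deposits the 8 least significant bits of src into bit 0 of each of the
  // 8 consecutive bytes selected by the 0x0101010101010101 mask, turning a packed
  // bit mask into a 0/1 value per byte lane.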
5484   if (mask_len > 8) {
5485     movq(rtmp2, src);
5486     vpxor(xtmp, xtmp, xtmp, vec_enc);
5487     movq(xtmp, rtmp1);
5488   }
5489   movq(dst, rtmp1);
5490 
5491   mask_len -= 8;
5492   while (mask_len > 0) {
5493     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5494     index++;
5495     if ((index % 2) == 0) {
5496       pxor(xtmp, xtmp);
5497     }
5498     mov64(rtmp1, 0x0101010101010101L);
5499     shrq(rtmp2, 8);
5500     pdepq(rtmp1, rtmp2, rtmp1);
5501     pinsrq(xtmp, rtmp1, index % 2);
5502     vindex = index / 2;
5503     if (vindex) {
5504       // Write the entire 16 byte vector once both 64 bit
5505       // lanes have been updated, to save redundant instructions.
5506       if (index % 2) {
5507         vinsertf128(dst, dst, xtmp, vindex);
5508       }
5509     } else {
5510       vmovdqu(dst, xtmp);
5511     }
5512     mask_len -= 8;
5513   }
5514 }
5515 
5516 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5517   switch(opc) {
5518     case Op_VectorMaskTrueCount:
5519       popcntq(dst, tmp);
5520       break;
5521     case Op_VectorMaskLastTrue:
5522       if (VM_Version::supports_lzcnt()) {
5523         lzcntq(tmp, tmp);
5524         movl(dst, 63);
5525         subl(dst, tmp);
5526       } else {
5527         movl(dst, -1);
5528         bsrq(tmp, tmp);
5529         cmov32(Assembler::notZero, dst, tmp);
5530       }
5531       break;
5532     case Op_VectorMaskFirstTrue:
5533       if (VM_Version::supports_bmi1()) {
5534         if (masklen < 32) {
5535           orl(tmp, 1 << masklen);
5536           tzcntl(dst, tmp);
5537         } else if (masklen == 32) {
5538           tzcntl(dst, tmp);
5539         } else {
5540           assert(masklen == 64, "");
5541           tzcntq(dst, tmp);
5542         }
5543       } else {
5544         if (masklen < 32) {
5545           orl(tmp, 1 << masklen);
5546           bsfl(dst, tmp);
5547         } else {
5548           assert(masklen == 32 || masklen == 64, "");
5549           movl(dst, masklen);
5550           if (masklen == 32)  {
5551             bsfl(tmp, tmp);
5552           } else {
5553             bsfq(tmp, tmp);
5554           }
5555           cmov32(Assembler::notZero, dst, tmp);
5556         }
5557       }
5558       break;
5559     case Op_VectorMaskToLong:
5560       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5561       break;
5562     default: assert(false, "Unhandled mask operation");
5563   }
5564 }
5565 
5566 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5567                                               int masklen, int masksize, int vec_enc) {
5568   assert(VM_Version::supports_popcnt(), "");
5569 
5570   if(VM_Version::supports_avx512bw()) {
5571     kmovql(tmp, mask);
5572   } else {
5573     assert(masklen <= 16, "");
5574     kmovwl(tmp, mask);
5575   }
5576 
5577   // A mask generated by partial vector comparison/replicate/mask manipulation
5578   // operations needs to be clipped to the actual mask length.
5579   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5580     andq(tmp, (1 << masklen) - 1);
5581   }
5582 
5583   vector_mask_operation_helper(opc, dst, tmp, masklen);
5584 }
5585 
5586 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5587                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5588   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5589          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5590   assert(VM_Version::supports_popcnt(), "");
5591 
5592   bool need_clip = false;
5593   switch(bt) {
5594     case T_BOOLEAN:
5595       // While masks of other types contain 0 or -1 per lane, boolean masks contain lane values of 0 or 1.
5596       vpxor(xtmp, xtmp, xtmp, vec_enc);
5597       vpsubb(xtmp, xtmp, mask, vec_enc);
5598       vpmovmskb(tmp, xtmp, vec_enc);
5599       need_clip = masklen < 16;
5600       break;
5601     case T_BYTE:
5602       vpmovmskb(tmp, mask, vec_enc);
5603       need_clip = masklen < 16;
5604       break;
5605     case T_SHORT:
5606       vpacksswb(xtmp, mask, mask, vec_enc);
5607       if (masklen >= 16) {
5608         vpermpd(xtmp, xtmp, 8, vec_enc);
5609       }
5610       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5611       need_clip = masklen < 16;
5612       break;
5613     case T_INT:
5614     case T_FLOAT:
5615       vmovmskps(tmp, mask, vec_enc);
5616       need_clip = masklen < 4;
5617       break;
5618     case T_LONG:
5619     case T_DOUBLE:
5620       vmovmskpd(tmp, mask, vec_enc);
5621       need_clip = masklen < 2;
5622       break;
5623     default: assert(false, "Unhandled type, %s", type2name(bt));
5624   }
5625 
5626   // A mask generated by partial vector comparison/replicate/mask manipulation
5627   // operations needs to be clipped to the actual mask length.
5628   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5629     // need_clip implies masklen < 32
5630     andq(tmp, (1 << masklen) - 1);
5631   }
5632 
5633   vector_mask_operation_helper(opc, dst, tmp, masklen);
5634 }
5635 
5636 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5637                                              Register rtmp2, int mask_len) {
5638   kmov(rtmp1, src);
5639   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5640   mov64(rtmp2, -1L);
5641   pextq(rtmp2, rtmp2, rtmp1);
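  // pextq extracts the bits of rtmp2 (all ones) at the positions of the retained
  // mask bits in rtmp1 and packs them contiguously into the low bits, so the
  // result has exactly popcount(mask) consecutive low bits set.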
5642   kmov(dst, rtmp2);
5643 }
5644 
5645 #ifdef _LP64
5646 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5647                                                     XMMRegister mask, Register rtmp, Register rscratch,
5648                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5649                                                     int vec_enc) {
5650   assert(type2aelembytes(bt) >= 4, "");
5651   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5652   address compress_perm_table = nullptr;
5653   address expand_perm_table = nullptr;
5654   if (type2aelembytes(bt) == 8) {
5655     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5656     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5657     vmovmskpd(rtmp, mask, vec_enc);
5658   } else {
5659     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5660     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5661     vmovmskps(rtmp, mask, vec_enc);
5662   }
5663   shlq(rtmp, 5); // for 32 byte permute row.
5664   if (opcode == Op_CompressV) {
5665     lea(rscratch, ExternalAddress(compress_perm_table));
5666   } else {
5667     lea(rscratch, ExternalAddress(expand_perm_table));
5668   }
5669   addptr(rtmp, rscratch);
5670   vmovdqu(permv, Address(rtmp));
5671   vpermps(dst, permv, src, Assembler::AVX_256bit);
5672   vpxor(xtmp, xtmp, xtmp, vec_enc);
5673   // Blend the result with a zero vector using the permute row as a blend mask.
5674   // Each column entry in a permute table row contains either a valid permute
5675   // index or a -1 (default) value, so the row itself can double as a blending
5676   // mask after compressing/expanding the source vector lanes.
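  // For illustration (assuming the row layout described above), the 32 bit compress
  // row for the 8-lane mask 0b00000101 would be { 0, 2, -1, -1, -1, -1, -1, -1 }:
  // lanes 0 and 2 are gathered to the front, and the remaining -1 columns have their
  // sign bit set, so they select the zero vector in the blend below.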
5677   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5678 }
5679 #endif
5680 
5681 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5682                                                bool merge, BasicType bt, int vec_enc) {
5683   if (opcode == Op_CompressV) {
5684     switch(bt) {
5685     case T_BYTE:
5686       evpcompressb(dst, mask, src, merge, vec_enc);
5687       break;
5688     case T_CHAR:
5689     case T_SHORT:
5690       evpcompressw(dst, mask, src, merge, vec_enc);
5691       break;
5692     case T_INT:
5693       evpcompressd(dst, mask, src, merge, vec_enc);
5694       break;
5695     case T_FLOAT:
5696       evcompressps(dst, mask, src, merge, vec_enc);
5697       break;
5698     case T_LONG:
5699       evpcompressq(dst, mask, src, merge, vec_enc);
5700       break;
5701     case T_DOUBLE:
5702       evcompresspd(dst, mask, src, merge, vec_enc);
5703       break;
5704     default:
5705       fatal("Unsupported type %s", type2name(bt));
5706       break;
5707     }
5708   } else {
5709     assert(opcode == Op_ExpandV, "");
5710     switch(bt) {
5711     case T_BYTE:
5712       evpexpandb(dst, mask, src, merge, vec_enc);
5713       break;
5714     case T_CHAR:
5715     case T_SHORT:
5716       evpexpandw(dst, mask, src, merge, vec_enc);
5717       break;
5718     case T_INT:
5719       evpexpandd(dst, mask, src, merge, vec_enc);
5720       break;
5721     case T_FLOAT:
5722       evexpandps(dst, mask, src, merge, vec_enc);
5723       break;
5724     case T_LONG:
5725       evpexpandq(dst, mask, src, merge, vec_enc);
5726       break;
5727     case T_DOUBLE:
5728       evexpandpd(dst, mask, src, merge, vec_enc);
5729       break;
5730     default:
5731       fatal("Unsupported type %s", type2name(bt));
5732       break;
5733     }
5734   }
5735 }
5736 #endif
5737 
5738 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5739                                            KRegister ktmp1, int vec_enc) {
5740   if (opcode == Op_SignumVD) {
5741     vsubpd(dst, zero, one, vec_enc);
5742     // dst = (src < 0) ? -1 : 1
5743     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5744     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5745     // if src is NaN, -0.0 or 0.0, return src.
5746     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5747     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5748   } else {
5749     assert(opcode == Op_SignumVF, "");
5750     vsubps(dst, zero, one, vec_enc);
5751     // dst = (src < 0) ? -1 : 1
5752     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5753     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5754     // if src is NaN, -0.0 or 0.0, return src.
5755     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5756     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5757   }
5758 }
5759 
5760 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5761                                           XMMRegister xtmp1, int vec_enc) {
5762   if (opcode == Op_SignumVD) {
5763     vsubpd(dst, zero, one, vec_enc);
5764     // dst = (src < 0) ? -1 : 1
5765     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5766     // if src is NaN, -0.0 or 0.0, return src.
5767     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5768     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5769   } else {
5770     assert(opcode == Op_SignumVF, "");
5771     vsubps(dst, zero, one, vec_enc);
5772     // dst = (src < 0) ? -1 : 1
5773     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5774     // if src is NaN, -0.0 or 0.0, return src.
5775     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5776     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5777   }
5778 }
5779 
5780 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5781   if (VM_Version::supports_avx512bw()) {
5782     if (mask_len > 32) {
5783       kmovql(dst, src);
5784     } else {
5785       kmovdl(dst, src);
5786       if (mask_len != 32) {
5787         kshiftrdl(dst, dst, 32 - mask_len);
5788       }
5789     }
5790   } else {
5791     assert(mask_len <= 16, "");
5792     kmovwl(dst, src);
5793     if (mask_len != 16) {
5794       kshiftrwl(dst, dst, 16 - mask_len);
5795     }
5796   }
5797 }
5798 
5799 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5800   int lane_size = type2aelembytes(bt);
5801   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5802   if ((is_LP64 || lane_size < 8) &&
5803       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5804        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5805     movptr(rtmp, imm32);
5806     switch(lane_size) {
5807       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5808       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5809       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5810       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5811       default: fatal("Unsupported lane size %d", lane_size); break;
5813     }
5814   } else {
5815     movptr(rtmp, imm32);
5816     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5817     switch(lane_size) {
5818       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5819       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5820       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5821       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5822       default: fatal("Unsupported lane size %d", lane_size); break;
5824     }
5825   }
5826 }
5827 
5828 //
5829 // The following is the lookup table based popcount computation algorithm:
5830 //       Index   Bit set count
5831 //     [ 0000 ->   0,
5832 //       0001 ->   1,
5833 //       0010 ->   1,
5834 //       0011 ->   2,
5835 //       0100 ->   1,
5836 //       0101 ->   2,
5837 //       0110 ->   2,
5838 //       0111 ->   3,
5839 //       1000 ->   1,
5840 //       1001 ->   2,
5841 //       1010 ->   3,
5842 //       1011 ->   3,
5843 //       1100 ->   2,
5844 //       1101 ->   3,
//       1110 ->   3,
5845 //       1111 ->   4 ]
5846 //  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
5847 //     shuffle indices for lookup table access.
5848 //  b. Right shift each byte of the vector lane by 4 positions.
5849 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5850 //     shuffle indices for lookup table access.
5851 //  d. Add the bitset counts of the upper and lower 4 bits of each byte.
5852 //  e. Unpack double words to quad words and compute the sum of absolute differences,
5853 //     which yields the bitset count over all the bytes of each quadword.
5854 //  f. Perform step e. for the upper 128 bit vector lane.
5855 //  g. Pack the bitset counts of the quadwords back to double words.
5856 //  h. The unpacking and packing operations are not needed for a 64 bit vector lane.
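//
// For example, for the byte value 0xD2 (0b11010010): the lower nibble 0b0010 maps
// to 1 and the upper nibble 0b1101 maps to 3 in the table above, so steps a-d
// produce a per-byte popcount of 1 + 3 = 4.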
5857 
5858 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5859                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5860   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5861   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5862   vpsrlw(dst, src, 4, vec_enc);
5863   vpand(dst, dst, xtmp1, vec_enc);
5864   vpand(xtmp1, src, xtmp1, vec_enc);
5865   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5866   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5867   vpshufb(dst, xtmp2, dst, vec_enc);
5868   vpaddb(dst, dst, xtmp1, vec_enc);
5869 }
5870 
5871 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5872                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5873   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5874   // The following code implements steps e, f, g and h of the above algorithm.
5875   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5876   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5877   vpsadbw(dst, dst, xtmp2, vec_enc);
5878   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5879   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5880   vpackuswb(dst, xtmp1, dst, vec_enc);
5881 }
5882 
5883 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5884                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5885   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5886   // Add the popcounts of the upper and lower bytes of each word.
5887   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5888   vpsrlw(dst, xtmp1, 8, vec_enc);
5889   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5890   vpaddw(dst, dst, xtmp1, vec_enc);
5891 }
5892 
5893 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5894                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5895   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5896   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5897   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5898 }
5899 
5900 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5901                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5902   switch(bt) {
5903     case T_LONG:
5904       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5905       break;
5906     case T_INT:
5907       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5908       break;
5909     case T_CHAR:
5910     case T_SHORT:
5911       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5912       break;
5913     case T_BYTE:
5914     case T_BOOLEAN:
5915       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5916       break;
5917     default:
5918       fatal("Unsupported type %s", type2name(bt));
5919       break;
5920   }
5921 }
5922 
5923 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5924                                                       KRegister mask, bool merge, int vec_enc) {
5925   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5926   switch(bt) {
5927     case T_LONG:
5928       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5929       evpopcntq(dst, mask, src, merge, vec_enc);
5930       break;
5931     case T_INT:
5932       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5933       evpopcntd(dst, mask, src, merge, vec_enc);
5934       break;
5935     case T_CHAR:
5936     case T_SHORT:
5937       assert(VM_Version::supports_avx512_bitalg(), "");
5938       evpopcntw(dst, mask, src, merge, vec_enc);
5939       break;
5940     case T_BYTE:
5941     case T_BOOLEAN:
5942       assert(VM_Version::supports_avx512_bitalg(), "");
5943       evpopcntb(dst, mask, src, merge, vec_enc);
5944       break;
5945     default:
5946       fatal("Unsupported type %s", type2name(bt));
5947       break;
5948   }
5949 }
5950 
5951 #ifndef _LP64
5952 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5953   assert(VM_Version::supports_avx512bw(), "");
5954   kmovdl(tmp, src);
5955   kunpckdql(dst, tmp, tmp);
5956 }
5957 #endif
5958 
5959 // The bit reversal algorithm first reverses the bits of each byte and then
5960 // performs a byte level reversal for multi-byte primitive types (short/int/long).
5961 // The algorithm uses a lookup table to obtain the reversed bit sequence
5962 // corresponding to a 4 bit value. The reversed bit sequence of a byte is then
5963 // obtained by swapping the reversed bit sequences of its upper and lower
5964 // nibbles.
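// For example, for the byte 0xB4 (0b10110100): the reversed lower nibble 0b0100 is
// 0b0010 and the reversed upper nibble 0b1011 is 0b1101; placing the reversed lower
// nibble in the upper half and the reversed upper nibble in the lower half yields
// 0b00101101 = 0x2D, which is 0xB4 with its bits reversed.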
5965 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5966                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5967   if (VM_Version::supports_avx512vlbw()) {
5968 
5969     // Get the reverse bit sequence of lower nibble of each byte.
5970     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5971     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5972     evpandq(dst, xtmp2, src, vec_enc);
5973     vpshufb(dst, xtmp1, dst, vec_enc);
5974     vpsllq(dst, dst, 4, vec_enc);
5975 
5976     // Get the reverse bit sequence of upper nibble of each byte.
5977     vpandn(xtmp2, xtmp2, src, vec_enc);
5978     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5979     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5980 
5981     // OR the left shifted reversed bit sequence of the lower nibble with the right shifted
5982     // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5983     evporq(xtmp2, dst, xtmp2, vec_enc);
5984     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5985 
5986   } else if(vec_enc == Assembler::AVX_512bit) {
5987     // Shift based bit reversal.
5988     assert(bt == T_LONG || bt == T_INT, "");
5989 
5990     // Swap lower and upper nibble of each byte.
5991     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5992 
5993     // Swap two least and most significant bits of each nibble.
5994     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5995 
5996     // Swap adjacent pair of bits.
5997     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5998     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5999 
6000     evmovdqul(xtmp1, k0, dst, true, vec_enc);
6001     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6002   } else {
6003     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6004     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6005 
6006     // Get the reverse bit sequence of lower nibble of each byte.
6007     vpand(dst, xtmp2, src, vec_enc);
6008     vpshufb(dst, xtmp1, dst, vec_enc);
6009     vpsllq(dst, dst, 4, vec_enc);
6010 
6011     // Get the reverse bit sequence of upper nibble of each byte.
6012     vpandn(xtmp2, xtmp2, src, vec_enc);
6013     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6014     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6015 
6016     // OR the left shifted reversed bit sequence of the lower nibble with the right shifted
6017     // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
6018     vpor(xtmp2, dst, xtmp2, vec_enc);
6019     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6020   }
6021 }
6022 
6023 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6024                                                 XMMRegister xtmp, Register rscratch) {
6025   assert(VM_Version::supports_gfni(), "");
6026   assert(rscratch != noreg || always_reachable(mask), "missing");
6027 
6028   // Galois field instruction based bit reversal, following the algorithm described at
6029   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6030   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6031   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6032   vector_reverse_byte(bt, dst, xtmp, vec_enc);
6033 }
6034 
6035 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6036                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
6037   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6038   evpandq(dst, xtmp1, src, vec_enc);
6039   vpsllq(dst, dst, nbits, vec_enc);
6040   vpandn(xtmp1, xtmp1, src, vec_enc);
6041   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6042   evporq(dst, dst, xtmp1, vec_enc);
6043 }
6044 
6045 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6046                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
6047   // Shift based bit reversal.
6048   assert(VM_Version::supports_evex(), "");
6049   switch(bt) {
6050     case T_LONG:
6051       // Swap upper and lower double word of each quad word.
6052       evprorq(xtmp1, k0, src, 32, true, vec_enc);
6053       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6054       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6055       break;
6056     case T_INT:
6057       // Swap upper and lower word of each double word.
6058       evprord(xtmp1, k0, src, 16, true, vec_enc);
6059       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6060       break;
6061     case T_CHAR:
6062     case T_SHORT:
6063       // Swap upper and lower byte of each word.
6064       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6065       break;
6066     case T_BYTE:
6067       evmovdquq(dst, k0, src, true, vec_enc);
6068       break;
6069     default:
6070       fatal("Unsupported type %s", type2name(bt));
6071       break;
6072   }
6073 }
6074 
6075 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6076   if (bt == T_BYTE) {
6077     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6078       evmovdquq(dst, k0, src, true, vec_enc);
6079     } else {
6080       vmovdqu(dst, src);
6081     }
6082     return;
6083   }
6084   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6085   // pre-computed shuffle indices.
6086   switch(bt) {
6087     case T_LONG:
6088       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6089       break;
6090     case T_INT:
6091       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6092       break;
6093     case T_CHAR:
6094     case T_SHORT:
6095       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6096       break;
6097     default:
6098       fatal("Unsupported type %s", type2name(bt));
6099       break;
6100   }
6101   vpshufb(dst, src, dst, vec_enc);
6102 }
6103 
6104 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6105                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6106                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6107   assert(is_integral_type(bt), "");
6108   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6109   assert(VM_Version::supports_avx512cd(), "");
6110   switch(bt) {
6111     case T_LONG:
6112       evplzcntq(dst, ktmp, src, merge, vec_enc);
6113       break;
6114     case T_INT:
6115       evplzcntd(dst, ktmp, src, merge, vec_enc);
6116       break;
6117     case T_SHORT:
6118       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6119       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6120       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6121       vpunpckhwd(dst, xtmp1, src, vec_enc);
6122       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6123       vpackusdw(dst, xtmp2, dst, vec_enc);
6124       break;
6125     case T_BYTE:
6126       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6127       // accessing the lookup table.
6128       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6129       // accessing the lookup table.
6130       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
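      // For example, for the byte 0x0C (assuming the lookup table maps a nibble to
      // its 4 bit leading zero count): the MSB nibble 0x0 gives T2 = 4 and the LSB
      // nibble 0xC gives T1 = 0; since the MSB nibble is zero, the result is
      // T1 + T2 = 4, the leading zero count of 0x0C as a byte.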
6131       assert(VM_Version::supports_avx512bw(), "");
6132       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6133       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6134       vpand(xtmp2, dst, src, vec_enc);
6135       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6136       vpsrlw(xtmp3, src, 4, vec_enc);
6137       vpand(xtmp3, dst, xtmp3, vec_enc);
6138       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6139       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6140       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6141       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6142       break;
6143     default:
6144       fatal("Unsupported type %s", type2name(bt));
6145       break;
6146   }
6147 }
6148 
6149 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6150                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6151   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6152   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6153   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6154   // accessing the lookup table.
6155   vpand(dst, xtmp2, src, vec_enc);
6156   vpshufb(dst, xtmp1, dst, vec_enc);
6157   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6158   // accessing the lookup table.
6159   vpsrlw(xtmp3, src, 4, vec_enc);
6160   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6161   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6162   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6163   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6164   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6165   vpaddb(dst, dst, xtmp2, vec_enc);
6166   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6167 }
6168 
6169 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6170                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6171   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6172   // Add zero counts of lower byte and upper byte of a word if
6173   // upper byte holds a zero value.
6174   vpsrlw(xtmp3, src, 8, vec_enc);
6175   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6176   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6177   vpsllw(xtmp2, dst, 8, vec_enc);
6178   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6179   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6180   vpsrlw(dst, dst, 8, vec_enc);
6181 }
6182 
6183 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6184                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6185   // Since the IEEE 754 floating point format represents the mantissa in normalized
6186   // 1.x form, the biased exponent can be used to compute the leading zero count via
6187   // the following formula (the code below folds the +1 into the exponent):
6188   //   LZCNT = 32 - ((biased_exp - 127) + 1)
6189   // Special handling is needed for zero, MAX_INT and negative source values.
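  // For example, for src = 8: (float) 8 is 2^3, so biased_exp = 130, the adjusted
  // exponent is (130 - 127) + 1 = 4, and LZCNT = 32 - 4 = 28, which matches the
  // number of leading zeros of 8 as a 32 bit integer.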
6190 
6191   // Broadcast 0xFF
6192   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6193   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6194 
6195   // Extract biased exponent.
6196   vcvtdq2ps(dst, src, vec_enc);
6197   vpsrld(dst, dst, 23, vec_enc);
6198   vpand(dst, dst, xtmp1, vec_enc);
6199 
6200   // Broadcast 127.
6201   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6202   // Exponent = biased_exp - 127
6203   vpsubd(dst, dst, xtmp1, vec_enc);
6204 
6205   // Exponent = Exponent  + 1
6206   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6207   vpaddd(dst, dst, xtmp3, vec_enc);
6208 
6209   // Replace -ve exponent with zero, exponent is -ve when src
6210   // lane contains a zero value.
6211   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6212   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6213 
6214   // Rematerialize broadcast 32.
6215   vpslld(xtmp1, xtmp3, 5, vec_enc);
6216   // Exponent is 32 if corresponding source lane contains max_int value.
6217   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6218   // LZCNT = 32 - exponent
6219   vpsubd(dst, xtmp1, dst, vec_enc);
6220 
6221   // Replace LZCNT with a value 1 if corresponding source lane
6222   // contains max_int value.
6223   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6224 
6225   // Replace biased_exp with 0 if source lane value is less than zero.
6226   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6227   vblendvps(dst, dst, xtmp2, src, vec_enc);
6228 }
6229 
6230 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6231                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6232   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6233   // Add zero counts of lower word and upper word of a double word if
6234   // upper word holds a zero value.
6235   vpsrld(xtmp3, src, 16, vec_enc);
6236   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6237   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6238   vpslld(xtmp2, dst, 16, vec_enc);
6239   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6240   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6241   vpsrld(dst, dst, 16, vec_enc);
6242   // Add zero counts of lower doubleword and upper doubleword of a
6243   // quadword if upper doubleword holds a zero value.
6244   vpsrlq(xtmp3, src, 32, vec_enc);
6245   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6246   vpsllq(xtmp2, dst, 32, vec_enc);
6247   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6248   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6249   vpsrlq(dst, dst, 32, vec_enc);
6250 }
6251 
6252 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6253                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6254                                                        Register rtmp, int vec_enc) {
6255   assert(is_integral_type(bt), "unexpected type");
6256   assert(vec_enc < Assembler::AVX_512bit, "");
6257   switch(bt) {
6258     case T_LONG:
6259       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6260       break;
6261     case T_INT:
6262       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6263       break;
6264     case T_SHORT:
6265       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6266       break;
6267     case T_BYTE:
6268       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6269       break;
6270     default:
6271       fatal("Unsupported type %s", type2name(bt));
6272       break;
6273   }
6274 }
6275 
6276 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6277   switch(bt) {
6278     case T_BYTE:
6279       vpsubb(dst, src1, src2, vec_enc);
6280       break;
6281     case T_SHORT:
6282       vpsubw(dst, src1, src2, vec_enc);
6283       break;
6284     case T_INT:
6285       vpsubd(dst, src1, src2, vec_enc);
6286       break;
6287     case T_LONG:
6288       vpsubq(dst, src1, src2, vec_enc);
6289       break;
6290     default:
6291       fatal("Unsupported type %s", type2name(bt));
6292       break;
6293   }
6294 }
6295 
6296 // Trailing zero count computation is based on the leading zero count operation as per
6297 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
6298 // a direct vector instruction to compute the leading zero count.
6299 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
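// For example, for a 32 bit x = 8 (0b1000): (x - 1) & ~x = 0b0111, whose leading
// zero count is 29, so CTZ = 32 - 29 = 3; for x = 0 the intermediate value is all
// ones, CLZ is 0, and CTZ = 32, as expected.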
6300 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6301                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6302                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6303   assert(is_integral_type(bt), "");
6304   // xtmp = -1
6305   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6306   // xtmp = xtmp + src
6307   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6308   // xtmp = xtmp & ~src
6309   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6310   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6311   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6312   vpsub(bt, dst, xtmp4, dst, vec_enc);
6313 }
6314 
6315 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
6316 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
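// For example, for a 32 bit x = 8: -x = 0xFFFFFFF8, so x | -x = 0xFFFFFFF8 with a
// popcount of 29, giving CTZ = 32 - 29 = 3; for x = 0, x | -x = 0 and CTZ = 32.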
6317 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6318                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6319   assert(is_integral_type(bt), "");
6320   // xtmp = 0
6321   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6322   // xtmp = 0 - src
6323   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6324   // xtmp = xtmp | src
6325   vpor(xtmp3, xtmp3, src, vec_enc);
6326   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6327   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6328   vpsub(bt, dst, xtmp1, dst, vec_enc);
6329 }
6330 
6331 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6332   Label done;
6333   Label neg_divisor_fastpath;
6334   cmpl(divisor, 0);
6335   jccb(Assembler::less, neg_divisor_fastpath);
6336   xorl(rdx, rdx);
6337   divl(divisor);
6338   jmpb(done);
6339   bind(neg_divisor_fastpath);
6340   // Fastpath for divisor < 0:
6341   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6342   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
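  // Intuition: a divisor with its sign bit set is, viewed as unsigned, at least 2^31,
  // so the unsigned quotient can only be 0 or 1; the quotient expression above evaluates
  // to 1 exactly when dividend >= divisor (unsigned), e.g. 0x80000000u / 0xFFFFFFFFu = 0.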
6343   movl(rdx, rax);
6344   subl(rdx, divisor);
6345   if (VM_Version::supports_bmi1()) {
6346     andnl(rax, rdx, rax);
6347   } else {
6348     notl(rdx);
6349     andl(rax, rdx);
6350   }
6351   shrl(rax, 31);
6352   bind(done);
6353 }
6354 
6355 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6356   Label done;
6357   Label neg_divisor_fastpath;
6358   cmpl(divisor, 0);
6359   jccb(Assembler::less, neg_divisor_fastpath);
6360   xorl(rdx, rdx);
6361   divl(divisor);
6362   jmpb(done);
6363   bind(neg_divisor_fastpath);
6364   // Fastpath when divisor < 0:
6365   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6366   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
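  // Intuition: with the divisor's sign bit set the unsigned quotient is 0 or 1 (see
  // udivI above), so the remainder is either the dividend itself or dividend - divisor;
  // the arithmetic shift turns the quotient bit into an all-ones/all-zeros mask that
  // selects divisor or 0 for the subtraction.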
6367   movl(rdx, rax);
6368   subl(rax, divisor);
6369   if (VM_Version::supports_bmi1()) {
6370     andnl(rax, rax, rdx);
6371   } else {
6372     notl(rax);
6373     andl(rax, rdx);
6374   }
6375   sarl(rax, 31);
6376   andl(rax, divisor);
6377   subl(rdx, rax);
6378   bind(done);
6379 }
6380 
6381 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6382   Label done;
6383   Label neg_divisor_fastpath;
6384 
6385   cmpl(divisor, 0);
6386   jccb(Assembler::less, neg_divisor_fastpath);
6387   xorl(rdx, rdx);
6388   divl(divisor);
6389   jmpb(done);
6390   bind(neg_divisor_fastpath);
6391   // Fastpath for divisor < 0:
6392   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6393   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6394   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6395   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6396   movl(rdx, rax);
6397   subl(rax, divisor);
6398   if (VM_Version::supports_bmi1()) {
6399     andnl(rax, rax, rdx);
6400   } else {
6401     notl(rax);
6402     andl(rax, rdx);
6403   }
6404   movl(tmp, rax);
6405   shrl(rax, 31); // quotient
6406   sarl(tmp, 31);
6407   andl(tmp, divisor);
6408   subl(rdx, tmp); // remainder
6409   bind(done);
6410 }
6411 
6412 #ifdef _LP64
6413 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6414                                  XMMRegister xtmp2, Register rtmp) {
6415   if(VM_Version::supports_gfni()) {
6416     // Galois field instruction based bit reversal, following the algorithm described at
6417     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6418     mov64(rtmp, 0x8040201008040201L);
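    // 0x8040201008040201 encodes the 8x8 bit matrix that, under gf2p8affineqb's bit
    // ordering, reverses the bit order within each byte (see the article referenced above).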
6419     movq(xtmp1, src);
6420     movq(xtmp2, rtmp);
6421     gf2p8affineqb(xtmp1, xtmp2, 0);
6422     movq(dst, xtmp1);
6423   } else {
6424     // Swap even and odd numbered bits.
6425     movl(rtmp, src);
6426     andl(rtmp, 0x55555555);
6427     shll(rtmp, 1);
6428     movl(dst, src);
6429     andl(dst, 0xAAAAAAAA);
6430     shrl(dst, 1);
6431     orl(dst, rtmp);
6432 
6433     // Swap LSB and MSB 2 bits of each nibble.
6434     movl(rtmp, dst);
6435     andl(rtmp, 0x33333333);
6436     shll(rtmp, 2);
6437     andl(dst, 0xCCCCCCCC);
6438     shrl(dst, 2);
6439     orl(dst, rtmp);
6440 
6441     // Swap LSB and MSB 4 bits of each byte.
6442     movl(rtmp, dst);
6443     andl(rtmp, 0x0F0F0F0F);
6444     shll(rtmp, 4);
6445     andl(dst, 0xF0F0F0F0);
6446     shrl(dst, 4);
6447     orl(dst, rtmp);
6448   }
6449   bswapl(dst);
6450 }
6451 
6452 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6453                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6454   if(VM_Version::supports_gfni()) {
6455     // Galois field instruction based bit reversal, following the algorithm described at
6456     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6457     mov64(rtmp1, 0x8040201008040201L);
6458     movq(xtmp1, src);
6459     movq(xtmp2, rtmp1);
6460     gf2p8affineqb(xtmp1, xtmp2, 0);
6461     movq(dst, xtmp1);
6462   } else {
6463     // Swap even and odd numbered bits.
6464     movq(rtmp1, src);
6465     mov64(rtmp2, 0x5555555555555555L);
6466     andq(rtmp1, rtmp2);
6467     shlq(rtmp1, 1);
6468     movq(dst, src);
6469     notq(rtmp2);
6470     andq(dst, rtmp2);
6471     shrq(dst, 1);
6472     orq(dst, rtmp1);
6473 
6474     // Swap LSB and MSB 2 bits of each nibble.
6475     movq(rtmp1, dst);
6476     mov64(rtmp2, 0x3333333333333333L);
6477     andq(rtmp1, rtmp2);
6478     shlq(rtmp1, 2);
6479     notq(rtmp2);
6480     andq(dst, rtmp2);
6481     shrq(dst, 2);
6482     orq(dst, rtmp1);
6483 
6484     // Swap LSB and MSB 4 bits of each byte.
6485     movq(rtmp1, dst);
6486     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6487     andq(rtmp1, rtmp2);
6488     shlq(rtmp1, 4);
6489     notq(rtmp2);
6490     andq(dst, rtmp2);
6491     shrq(dst, 4);
6492     orq(dst, rtmp1);
6493   }
6494   bswapq(dst);
6495 }
6496 
6497 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6498   Label done;
6499   Label neg_divisor_fastpath;
6500   cmpq(divisor, 0);
6501   jccb(Assembler::less, neg_divisor_fastpath);
6502   xorl(rdx, rdx);
6503   divq(divisor);
6504   jmpb(done);
6505   bind(neg_divisor_fastpath);
6506   // Fastpath for divisor < 0:
6507   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6508   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6509   movq(rdx, rax);
6510   subq(rdx, divisor);
6511   if (VM_Version::supports_bmi1()) {
6512     andnq(rax, rdx, rax);
6513   } else {
6514     notq(rdx);
6515     andq(rax, rdx);
6516   }
6517   shrq(rax, 63);
6518   bind(done);
6519 }
6520 
6521 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6522   Label done;
6523   Label neg_divisor_fastpath;
6524   cmpq(divisor, 0);
6525   jccb(Assembler::less, neg_divisor_fastpath);
6526   xorq(rdx, rdx);
6527   divq(divisor);
6528   jmp(done);
6529   bind(neg_divisor_fastpath);
6530   // Fastpath when divisor < 0:
6531   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6532   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6533   movq(rdx, rax);
6534   subq(rax, divisor);
6535   if (VM_Version::supports_bmi1()) {
6536     andnq(rax, rax, rdx);
6537   } else {
6538     notq(rax);
6539     andq(rax, rdx);
6540   }
6541   sarq(rax, 63);
6542   andq(rax, divisor);
6543   subq(rdx, rax);
6544   bind(done);
6545 }
6546 
6547 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6548   Label done;
6549   Label neg_divisor_fastpath;
6550   cmpq(divisor, 0);
6551   jccb(Assembler::less, neg_divisor_fastpath);
6552   xorq(rdx, rdx);
6553   divq(divisor);
6554   jmp(done);
6555   bind(neg_divisor_fastpath);
6556   // Fastpath for divisor < 0:
6557   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6558   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6559   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6560   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6561   movq(rdx, rax);
6562   subq(rax, divisor);
6563   if (VM_Version::supports_bmi1()) {
6564     andnq(rax, rax, rdx);
6565   } else {
6566     notq(rax);
6567     andq(rax, rdx);
6568   }
6569   movq(tmp, rax);
6570   shrq(rax, 63); // quotient
6571   sarq(tmp, 63);
6572   andq(tmp, divisor);
6573   subq(rdx, tmp); // remainder
6574   bind(done);
6575 }
6576 #endif
6577 
6578 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6579                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6580                                         int vlen_enc) {
6581   assert(VM_Version::supports_avx512bw(), "");
6582   // Byte shuffles are in-lane operations and indices are determined using the
6583   // lower 4 bits of each shuffle byte, so all shuffle indices are effectively
6584   // normalized to the index range 0-15. As a consequence, indices that differ
6585   // only by a multiple of 16 select the same relative position within a 128 bit
6586   // lane, e.g. shuffle indices 0, 16, 32 and 48 all select the first element of
6587   // their respective 128 bit lanes.
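  // For example, shuffle index 37 selects source lane 37 / 16 = 2, and after
  // normalization it selects byte 37 & 0xF = 5 within that lane.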
6588   movl(rtmp, 16);
6589   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6590 
6591   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
6592   // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
6593   // the original shuffle indices and move the shuffled lanes corresponding to a true
6594   // mask to the destination vector.
6595   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6596   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6597   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6598 
6599   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6600   // and broadcasting second 128 bit lane.
6601   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6602   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6603   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6604   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6605   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6606 
6607   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6608   // and broadcasting third 128 bit lane.
6609   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6610   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6611   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6612   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6613   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6614 
6615   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6616   // and broadcasting fourth 128 bit lane.
6617   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6618   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6619   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6620   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6621   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6622 }
6623 
6624 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6625                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6626   if (vlen_enc == AVX_128bit) {
6627     vpermilps(dst, src, shuffle, vlen_enc);
6628   } else if (bt == T_INT) {
6629     vpermd(dst, shuffle, src, vlen_enc);
6630   } else {
6631     assert(bt == T_FLOAT, "");
6632     vpermps(dst, shuffle, src, vlen_enc);
6633   }
6634 }