1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes,
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // stack bang then we must use the 6 byte frame allocation even if
  60   // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  // We require their callers to bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
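  // Illustrative frame shape at this point (stack grows down; sizes are relative to the
  // original framesize argument, which still counts the return-address and saved-rbp slots):
  //   [ return address ]
  //   [ saved rbp      ]
  //   [ framesize - 2 * wordSize bytes of frame body ]   <- rsp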
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 #if INCLUDE_RTM_OPT
 176 
 177 // Update rtm_counters based on abort status
 178 // input: abort_status
 179 //        rtm_counters (RTMLockingCounters*)
 180 // flags are killed
 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 182 
 183   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 184   if (PrintPreciseRTMLockingStatistics) {
 185     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 186       Label check_abort;
 187       testl(abort_status, (1<<i));
 188       jccb(Assembler::equal, check_abort);
 189       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 190       bind(check_abort);
 191     }
 192   }
 193 }
 194 
 195 // Branch if (random & (count-1) != 0), count is 2^n
 196 // tmp, scr and flags are killed
 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 198   assert(tmp == rax, "");
 199   assert(scr == rdx, "");
 200   rdtsc(); // modifies EDX:EAX
 201   andptr(tmp, count-1);
 202   jccb(Assembler::notZero, brLabel);
 203 }
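// In the uses below, count == RTMTotalCountIncrRate, so masking the low TSC bits makes the
// following counter increment happen on roughly 1 out of every 'count' executions.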
 204 
 205 // Perform abort ratio calculation, set no_rtm bit if high ratio
 206 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 207 // tmpReg, rtm_counters_Reg and flags are killed
 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 209                                                     Register rtm_counters_Reg,
 210                                                     RTMLockingCounters* rtm_counters,
 211                                                     Metadata* method_data) {
 212   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 213 
 214   if (RTMLockingCalculationDelay > 0) {
 215     // Delay calculation
 216     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 217     testptr(tmpReg, tmpReg);
 218     jccb(Assembler::equal, L_done);
 219   }
 220   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 221   //   Aborted transactions = abort_count * 100
 222   //   All transactions = total_count *  RTMTotalCountIncrRate
 223   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
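  //
  // C-like sketch of the decision implemented below (illustration only; rtm_state is the MDO
  // field updated via MethodData::rtm_state_offset()):
  //   if (abort_count >= RTMAbortThreshold &&
  //       abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
  //     rtm_state |= NoRTM;          // abort ratio too high -- stop using RTM for this method
  //   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
  //     rtm_state |= UseRTM;         // enough transactions with a low abort ratio
  //   }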
 224 
 225   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 226   cmpptr(tmpReg, RTMAbortThreshold);
 227   jccb(Assembler::below, L_check_always_rtm2);
 228   imulptr(tmpReg, tmpReg, 100);
 229 
 230   Register scrReg = rtm_counters_Reg;
 231   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 232   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 233   imulptr(scrReg, scrReg, RTMAbortRatio);
 234   cmpptr(tmpReg, scrReg);
 235   jccb(Assembler::below, L_check_always_rtm1);
 236   if (method_data != nullptr) {
 237     // set rtm_state to "no rtm" in MDO
 238     mov_metadata(tmpReg, method_data);
 239     lock();
 240     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 241   }
 242   jmpb(L_done);
 243   bind(L_check_always_rtm1);
 244   // Reload RTMLockingCounters* address
 245   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 246   bind(L_check_always_rtm2);
 247   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 248   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 249   jccb(Assembler::below, L_done);
 250   if (method_data != nullptr) {
 251     // set rtm_state to "always rtm" in MDO
 252     mov_metadata(tmpReg, method_data);
 253     lock();
 254     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 255   }
 256   bind(L_done);
 257 }
 258 
 259 // Update counters and perform abort ratio calculation
 260 // input:  abort_status_Reg
 261 // rtm_counters_Reg, flags are killed
 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 263                                       Register rtm_counters_Reg,
 264                                       RTMLockingCounters* rtm_counters,
 265                                       Metadata* method_data,
 266                                       bool profile_rtm) {
 267 
 268   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 269   // update rtm counters based on rax value at abort
 270   // reads abort_status_Reg, updates flags
 271   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 272   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 273   if (profile_rtm) {
 274     // Save abort status because abort_status_Reg is used by following code.
 275     if (RTMRetryCount > 0) {
 276       push(abort_status_Reg);
 277     }
 278     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 279     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 280     // restore abort status
 281     if (RTMRetryCount > 0) {
 282       pop(abort_status_Reg);
 283     }
 284   }
 285 }
 286 
 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 288 // inputs: retry_count_Reg
 289 //       : abort_status_Reg
 290 // output: retry_count_Reg decremented by 1
 291 // flags are killed
 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 293   Label doneRetry;
 294   assert(abort_status_Reg == rax, "");
 295   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 296   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 297   // if reason is in 0x6 and retry count != 0 then retry
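  // For example: abort_status 0x2, 0x4 or 0x6 falls through to the retry-count check below,
  // while 0x0 or 0x1 (an explicit xabort) branches straight to doneRetry.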
 298   andptr(abort_status_Reg, 0x6);
 299   jccb(Assembler::zero, doneRetry);
 300   testl(retry_count_Reg, retry_count_Reg);
 301   jccb(Assembler::zero, doneRetry);
 302   pause();
 303   decrementl(retry_count_Reg);
 304   jmp(retryLabel);
 305   bind(doneRetry);
 306 }
 307 
 308 // Spin and retry if lock is busy,
 309 // inputs: box_Reg (monitor address)
 310 //       : retry_count_Reg
 311 // output: retry_count_Reg decremented by 1
 312 //       : clear z flag if retry count exceeded
 313 // tmp_Reg, scr_Reg, flags are killed
 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 315                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 316   Label SpinLoop, SpinExit, doneRetry;
 317   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 318 
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   decrementl(retry_count_Reg);
 322   movptr(scr_Reg, RTMSpinLoopCount);
 323 
 324   bind(SpinLoop);
 325   pause();
 326   decrementl(scr_Reg);
 327   jccb(Assembler::lessEqual, SpinExit);
 328   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 329   testptr(tmp_Reg, tmp_Reg);
 330   jccb(Assembler::notZero, SpinLoop);
 331 
 332   bind(SpinExit);
 333   jmp(retryLabel);
 334   bind(doneRetry);
 335   incrementl(retry_count_Reg); // clear z flag
 336 }
 337 
 338 // Use RTM for normal stack locks
 339 // Input: objReg (object to lock)
 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 341                                          Register retry_on_abort_count_Reg,
 342                                          RTMLockingCounters* stack_rtm_counters,
 343                                          Metadata* method_data, bool profile_rtm,
 344                                          Label& DONE_LABEL, Label& IsInflated) {
 345   assert(UseRTMForStackLocks, "why call this otherwise?");
 346   assert(tmpReg == rax, "");
 347   assert(scrReg == rdx, "");
 348   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 349 
 350   if (RTMRetryCount > 0) {
 351     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 352     bind(L_rtm_retry);
 353   }
 354   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 355   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 356   jcc(Assembler::notZero, IsInflated);
 357 
 358   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 359     Label L_noincrement;
 360     if (RTMTotalCountIncrRate > 1) {
 361       // tmpReg, scrReg and flags are killed
 362       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 363     }
 364     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 365     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 366     bind(L_noincrement);
 367   }
 368   xbegin(L_on_abort);
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 370   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 371   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 372   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 373 
 374   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 375   if (UseRTMXendForLockBusy) {
 376     xend();
 377     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 378     jmp(L_decrement_retry);
 379   }
 380   else {
 381     xabort(0);
 382   }
 383   bind(L_on_abort);
 384   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 385     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 386   }
 387   bind(L_decrement_retry);
 388   if (RTMRetryCount > 0) {
 389     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 390     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 391   }
 392 }
 393 
 394 // Use RTM for inflating locks
 395 // inputs: objReg (object to lock)
 396 //         boxReg (on-stack box address (displaced header location) - KILLED)
 397 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 399                                             Register scrReg, Register retry_on_busy_count_Reg,
 400                                             Register retry_on_abort_count_Reg,
 401                                             RTMLockingCounters* rtm_counters,
 402                                             Metadata* method_data, bool profile_rtm,
 403                                             Label& DONE_LABEL) {
 404   assert(UseRTMLocking, "why call this otherwise?");
 405   assert(tmpReg == rax, "");
 406   assert(scrReg == rdx, "");
 407   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 408   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 409 
 410   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 411   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 412 
 413   if (RTMRetryCount > 0) {
 414     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 415     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 416     bind(L_rtm_retry);
 417   }
 418   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 419     Label L_noincrement;
 420     if (RTMTotalCountIncrRate > 1) {
 421       // tmpReg, scrReg and flags are killed
 422       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 423     }
 424     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 425     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 426     bind(L_noincrement);
 427   }
 428   xbegin(L_on_abort);
 429   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 430   movptr(tmpReg, Address(tmpReg, owner_offset));
 431   testptr(tmpReg, tmpReg);
 432   jcc(Assembler::zero, DONE_LABEL);
 433   if (UseRTMXendForLockBusy) {
 434     xend();
 435     jmp(L_decrement_retry);
 436   }
 437   else {
 438     xabort(0);
 439   }
 440   bind(L_on_abort);
 441   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 442   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 443     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 444   }
 445   if (RTMRetryCount > 0) {
 446     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 447     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 448   }
 449 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 453 
 454   // Appears unlocked - try to swing _owner from null to non-null.
 455   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 456 #ifdef _LP64
 457   Register threadReg = r15_thread;
 458 #else
 459   get_thread(scrReg);
 460   Register threadReg = scrReg;
 461 #endif
 462   movptr(scrReg, Address(threadReg, JavaThread::lock_id_offset()));
 463   lock();
 464   cmpxchgptr(scrReg, Address(boxReg, owner_offset)); // Updates tmpReg
 465 
 466   if (RTMRetryCount > 0) {
 467     // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
 469     bind(L_decrement_retry);
 470     // Spin and retry if lock is busy.
 471     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 472     jmp(DONE_LABEL);
 473   }
 474   else {
 475     bind(L_decrement_retry);
 476     jmp(DONE_LABEL);
 477   }
 478 }
 479 
 480 #endif //  INCLUDE_RTM_OPT
 481 
 482 // fast_lock and fast_unlock used by C2
 483 
 484 // Because the transitions from emitted code to the runtime
 485 // monitorenter/exit helper stubs are so slow it's critical that
 486 // we inline both the stack-locking fast path and the inflated fast path.
 487 //
 488 // See also: cmpFastLock and cmpFastUnlock.
 489 //
 490 // What follows is a specialized inline transliteration of the code
 491 // in enter() and exit(). If we're concerned about I$ bloat another
 492 // option would be to emit TrySlowEnter and TrySlowExit methods
 493 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 495 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 496 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 497 // In practice, however, the # of lock sites is bounded and is usually small.
 498 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 502 //
 503 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 504 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 505 // to those specialized methods.  That'd give us a mostly platform-independent
 506 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 511 //
 512 // TODO:
 513 //
 514 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 515 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 516 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 517 //    the lock operators would typically be faster than reifying Self.
 518 //
 519 // *  Ideally I'd define the primitives as:
 520 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 521 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 522 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 524 //    Furthermore the register assignments are overconstrained, possibly resulting in
 525 //    sub-optimal code near the synchronization site.
 526 //
 527 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 528 //    Alternately, use a better sp-proximity test.
 529 //
 530 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 531 //    Either one is sufficient to uniquely identify a thread.
 532 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 533 //
 534 // *  Intrinsify notify() and notifyAll() for the common cases where the
 535 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 537 //
 538 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 539 //    But beware of excessive branch density on AMD Opterons.
 540 //
 541 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 542 //    or failure of the fast path.  If the fast path fails then we pass
 543 //    control to the slow path, typically in C.  In fast_lock and
 544 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 545 //    will emit a conditional branch immediately after the node.
 546 //    So we have branches to branches and lots of ICC.ZF games.
 547 //    Instead, it might be better to have C2 pass a "FailureLabel"
 548 //    into fast_lock and fast_unlock.  In the case of success, control
 549 //    will drop through the node.  ICC.ZF is undefined at exit.
 550 //    In the case of failure, the node will branch directly to the
 551 //    FailureLabel
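//
// Illustrative sketch (assumed shape, not the actual generated code) of how C2 consumes this
// ZF contract at a cmpFastLock site:
//    fast_lock(obj, box, tmp, scr, ...)   // leaves ZF == 1 on success, ZF == 0 on failure
//    jne   slow_path_call                 // ZF == 0 -> call the runtime monitorenter helper
//    ...                                  // ZF == 1 -> continue with the lock held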
 552 
 553 
 554 // obj: object to lock
 555 // box: on-stack box address (displaced header location) - KILLED
 556 // rax,: tmp -- KILLED
 557 // scr: tmp -- KILLED
 558 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 559                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 560                                  RTMLockingCounters* rtm_counters,
 561                                  RTMLockingCounters* stack_rtm_counters,
 562                                  Metadata* method_data,
 563                                  bool use_rtm, bool profile_rtm) {
 564   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 565   // Ensure the register assignments are disjoint
 566   assert(tmpReg == rax, "");
 567 
 568   if (use_rtm) {
 569     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 570   } else {
 571     assert(cx1Reg == noreg, "");
 572     assert(cx2Reg == noreg, "");
 573     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 574   }
 575 
 576   // Possible cases that we'll encounter in fast_lock
 577   // ------------------------------------------------
 578   // * Inflated
 579   //    -- unlocked
 580   //    -- Locked
 581   //       = by self
 582   //       = by other
 583   // * neutral
 584   // * stack-locked
 585   //    -- by self
 586   //       = sp-proximity test hits
 587   //       = sp-proximity test generates false-negative
 588   //    -- by other
 589   //
 590 
 591   Label IsInflated, DONE_LABEL, COUNT;
 592 
 593   if (DiagnoseSyncOnValueBasedClasses != 0) {
 594     load_klass(tmpReg, objReg, scrReg);
 595     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 596     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 597     jcc(Assembler::notZero, DONE_LABEL);
 598   }
 599 
 600 #if INCLUDE_RTM_OPT
 601   if (UseRTMForStackLocks && use_rtm) {
 602     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 603     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 604                       stack_rtm_counters, method_data, profile_rtm,
 605                       DONE_LABEL, IsInflated);
 606   }
 607 #endif // INCLUDE_RTM_OPT
 608 
 609   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 610   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 611   jcc(Assembler::notZero, IsInflated);
 612 
 613   if (LockingMode == LM_MONITOR) {
 614     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 615     testptr(objReg, objReg);
 616   } else {
 617     assert(LockingMode == LM_LEGACY, "must be");
 618     // Attempt stack-locking ...
 619     orptr (tmpReg, markWord::unlocked_value);
 620     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 621     lock();
 622     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 623     jcc(Assembler::equal, COUNT);           // Success
 624 
 625     // Recursive locking.
 626     // The object is stack-locked: markword contains stack pointer to BasicLock.
 627     // Locked by current thread if difference with current SP is less than one page.
 628     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
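    // For example, assuming a 4096-byte page on LP64, the immediate below is 7 - 4096 = 0xfffff007,
    // which sign-extends to 0xfffffffffffff007: ZF == 1 only when the mark-to-rsp distance is
    // 8-byte aligned and smaller than one page.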
 630     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 631     movptr(Address(boxReg, 0), tmpReg);
 632   }
 633   // After recursive stack locking attempt case
 634   jmp(DONE_LABEL);
 635 
 636   bind(IsInflated);
 637   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 638 
 639 #if INCLUDE_RTM_OPT
 640   // Use the same RTM locking code in 32- and 64-bit VM.
 641   if (use_rtm) {
 642     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 643                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 644   } else {
 645 #endif // INCLUDE_RTM_OPT
 646 
 647 #ifndef _LP64
 648   // The object is inflated.
 649 
 650   // boxReg refers to the on-stack BasicLock in the current frame.
 651   // We'd like to write:
 652   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 654   // additional latency as we have another ST in the store buffer that must drain.
 655 
 656   // avoid ST-before-CAS
 657   // register juggle because we need tmpReg for cmpxchgptr below
 658   movptr(scrReg, boxReg);
 659   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 660 
 661   // Optimistic form: consider XORL tmpReg,tmpReg
 662   movptr(tmpReg, NULL_WORD);
 663 
 664   // Appears unlocked - try to swing _owner from null to non-null.
 665   // Ideally, I'd manifest "Self" with get_thread and then attempt
 666   // to CAS the register containing thread id into m->Owner.
 667   // But we don't have enough registers, so instead we can either try to CAS
 668   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 669   // we later store thread id into m->Owner.  Transiently storing a stack address
 670   // (rsp or the address of the box) into  m->owner is harmless.
 671   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 672   lock();
 673   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 674   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 675   // If we weren't able to swing _owner from null to the BasicLock
 676   // then take the slow path.
 677   jccb  (Assembler::notZero, DONE_LABEL);
 678   // update _owner from BasicLock to thread
 679   get_thread (scrReg);                    // beware: clobbers ICCs
 680   movptr(scrReg, Address(scrReg, JavaThread::lock_id_offset()));
 681   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 682   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 683   jmp(DONE_LABEL);
 684 
 685   // If the CAS fails we can either retry or pass control to the slow path.
 686   // We use the latter tactic.
 687   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 688   // If the CAS was successful ...
 689   //   Self has acquired the lock
 690   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 691   // Intentional fall-through into DONE_LABEL ...
 692 #else // _LP64
 693   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 694   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 695   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 696 
 697   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 698   movq(scrReg, tmpReg);
 699   xorq(tmpReg, tmpReg);
 700   movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
 701   lock();
 702   cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 703 
 704   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 705   jccb(Assembler::equal, DONE_LABEL);    // CAS above succeeded; propagate ZF = 1 (success)
 706 
 707   cmpptr(boxReg, rax);                // Check if we are already the owner (recursive lock)
 708   jccb(Assembler::notEqual, DONE_LABEL);    // If not recursive, ZF = 0 at this point (fail)
 709   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 710   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 711   jmp(DONE_LABEL);
 712 #endif // _LP64
 713 #if INCLUDE_RTM_OPT
 714   } // use_rtm()
 715 #endif
 716 
 717   bind(COUNT);
 718   // Count monitors in fast path
 719   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 720   xorl(tmpReg, tmpReg); // Set ZF == 1
 721 
 722   bind(DONE_LABEL);
 723 
 724   // At DONE_LABEL the icc ZFlag is set as follows ...
 725   // fast_unlock uses the same protocol.
 726   // ZFlag == 1 -> Success
 727   // ZFlag == 0 -> Failure - force control through the slow path
 728 }
 729 
 730 // obj: object to unlock
 731 // box: box address (displaced header location), killed.  Must be EAX.
 732 // tmp: killed, cannot be obj nor box.
 733 //
 734 // Some commentary on balanced locking:
 735 //
 736 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 737 // Methods that don't have provably balanced locking are forced to run in the
 738 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 739 // The interpreter provides two properties:
 740 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 742 //      interpreter maintains an on-stack list of locks currently held by
 743 //      a frame.
 744 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 746 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 748 // B() doesn't have provably balanced locking so it runs in the interpreter.
 749 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 750 // is still locked by A().
 751 //
 752 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 753 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 754 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 755 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 756 // Arguably given that the spec legislates the JNI case as undefined our implementation
 757 // could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 759 // A perfectly viable alternative is to elide the owner check except when
 760 // Xcheck:jni is enabled.
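//
// Illustrative trace of the scenario above (hypothetical methods a() and b()):
//   a():  fast_lock(O)     // a() is compiled and provably balanced
//         call b()         // b() is not provably balanced -> runs interpreted; I1 and I2 apply to it
//         fast_unlock(O)   // O is still locked by a()'s frame here, per I1 and I2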
 761 
 762 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, bool use_rtm) {
 763   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 764   assert(boxReg == rax, "");
 765   assert_different_registers(objReg, boxReg, tmpReg);
 766 
 767   Label DONE_LABEL, Stacked, COUNT;
 768 
 769 #if INCLUDE_RTM_OPT
 770   if (UseRTMForStackLocks && use_rtm) {
 771     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 772     Label L_regular_unlock;
 773     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 774     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 775     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 776     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 777     xend();                                                           // otherwise end...
 778     jmp(DONE_LABEL);                                                  // ... and we're done
 779     bind(L_regular_unlock);
 780   }
 781 #endif
 782 
 783   if (LockingMode == LM_LEGACY) {
 784     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 785     jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 786   }
 787   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 788   if (LockingMode != LM_MONITOR) {
 789     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 790     jcc(Assembler::zero, Stacked);
 791   }
 792 
 793   // It's inflated.
 794   // If the owner is ANONYMOUS, we need to fix it -  in an outline stub.
 795   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 796 #ifdef _LP64
 797   if (!Compile::current()->output()->in_scratch_emit_size()) {
 798     C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 799     Compile::current()->output()->add_stub(stub);
 800     jcc(Assembler::equal, stub->entry());
 801     bind(stub->continuation());
 802   } else
 803 #endif
 804   {
 805     // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 806     // Call the slow-path instead.
 807     jcc(Assembler::notEqual, DONE_LABEL);
 808   }
 809 
 810 #if INCLUDE_RTM_OPT
 811   if (use_rtm) {
 812     Label L_regular_inflated_unlock;
 813     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 814     movptr(boxReg, Address(tmpReg, owner_offset));
 815     testptr(boxReg, boxReg);
 816     jccb(Assembler::notZero, L_regular_inflated_unlock);
 817     xend();
 818     jmp(DONE_LABEL);
 819     bind(L_regular_inflated_unlock);
 820   }
 821 #endif
 822 
 823   // Despite our balanced locking property we still check that m->_owner == Self
 824   // as java routines or native JNI code called by this thread might
 825   // have released the lock.
 826   // Refer to the comments in synchronizer.cpp for how we might encode extra
 827   // state in _succ so we can avoid fetching EntryList|cxq.
 828   //
 829   // If there's no contention try a 1-0 exit.  That is, exit without
 830   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 831   // we detect and recover from the race that the 1-0 exit admits.
 832   //
 833   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 834   // before it STs null into _owner, releasing the lock.  Updates
 835   // to data protected by the critical section must be visible before
 836   // we drop the lock (and thus before any other thread could acquire
 837   // the lock and observe the fields protected by the lock).
 838   // IA32's memory-model is SPO, so STs are ordered with respect to
 839   // each other and there's no need for an explicit barrier (fence).
 840   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
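  //
  // A simplified C-like sketch of the 1-0 exit described above (illustration only; this is
  // closest to the 64-bit sequence that follows):
  //   if (m->_recursions != 0) { m->_recursions--; return success; }
  //   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = nullptr; return success; }  // uncontended
  //   if (m->_succ == nullptr) return slow_path;   // no apparent successor -> let the runtime pick one
  //   m->_owner = nullptr;                         // release the lock
  //   fence();                                     // ST _owner; MEMBAR; LD _succ  (Dekker pivot)
  //   if (m->_succ != nullptr) return success;     // a successor was observed; it will take over
  //   if (CAS(&m->_owner, nullptr, self) != nullptr) return success;  // another thread took the lock
  //   return slow_path;                            // we re-acquired -> hand off succession to the runtime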
 841 #ifndef _LP64
 842   // Note that we could employ various encoding schemes to reduce
 843   // the number of loads below (currently 4) to just 2 or 3.
 844   // Refer to the comments in synchronizer.cpp.
 845   // In practice the chain of fetches doesn't seem to impact performance, however.
 846   xorptr(boxReg, boxReg);
 847   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 848   jccb  (Assembler::notZero, DONE_LABEL);
 849   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 850   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 851   jccb  (Assembler::notZero, DONE_LABEL);
 852   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 853   jmpb  (DONE_LABEL);
 854 #else // _LP64
 855   // It's inflated
 856   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 857 
 858   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 859   jccb(Assembler::equal, LNotRecursive);
 860 
 861   // Recursive inflated unlock
 862   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 863   xorl(tmpReg, tmpReg); // Set ZF == 1
 864   jmp(DONE_LABEL);
 865 
 866   bind(LNotRecursive);
 867 
 868   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 869   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 870   jccb  (Assembler::notZero, CheckSucc);
 871   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 872   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 873   jmpb  (DONE_LABEL);
 874 
 875   // Try to avoid passing control into the slow_path ...
 876   bind  (CheckSucc);
 877 
 878   // The following optional optimization can be elided if necessary
 879   // Effectively: if (succ == null) goto slow path
 880   // The code reduces the window for a race, however,
 881   // and thus benefits performance.
 882   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 883   jccb  (Assembler::zero, LGoSlowPath);
 884 
 885   xorptr(boxReg, boxReg);
 886   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 887   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 888 
 889   // Memory barrier/fence
 890   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 891   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 892   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 893   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 894   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 895   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 896   lock(); addl(Address(rsp, 0), 0);
 897 
 898   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 899   jccb  (Assembler::notZero, LSuccess);
 900 
 901   // Rare inopportune interleaving - race.
 902   // The successor vanished in the small window above.
 903   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 904   // We need to ensure progress and succession.
 905   // Try to reacquire the lock.
 906   // If that fails then the new owner is responsible for succession and this
 907   // thread needs to take no further action and can exit via the fast path (success).
 908   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 912 
 913   // box is really RAX -- the following CMPXCHG depends on that binding
 914   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 915   movptr(scrReg, Address(r15_thread, JavaThread::lock_id_offset()));
 916   lock();
 917   cmpxchgptr(scrReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 918   // There's no successor so we tried to regrab the lock.
 919   // If that didn't work, then another thread grabbed the
 920   // lock so we're done (and exit was a success).
 921   jccb  (Assembler::notEqual, LSuccess);
 922   // Intentional fall-through into slow path
 923 
 924   bind  (LGoSlowPath);
 925   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 926   jmpb  (DONE_LABEL);
 927 
 928   bind  (LSuccess);
 929   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 930   jmpb  (DONE_LABEL);
 931 
 932 #endif
 933   if (LockingMode == LM_LEGACY) {
 934     bind  (Stacked);
 935     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 936     lock();
 937     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 938     jccb(Assembler::notZero, DONE_LABEL);
 939     // Count monitors in fast path
 940 #ifndef _LP64
 941     get_thread(tmpReg);
 942     decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 943 #else // _LP64
 944     decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 945 #endif
 946     xorl(tmpReg, tmpReg); // Set ZF == 1
 947   }
 948 
 949   // ZFlag == 1 -> Success
 950   // ZFlag == 0 -> Failure - force control through the slow path
 951   bind(DONE_LABEL);
 952 }
 953 
 954 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 955                                               Register t, Register thread) {
 956   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 957   assert(rax_reg == rax, "Used for CAS");
 958   assert_different_registers(obj, box, rax_reg, t, thread);
 959 
 960   // Handle inflated monitor.
 961   Label inflated;
 962   // Finish fast lock successfully.
 963   Label locked;
 964   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 965   Label slow_path;
 966 
 967   if (DiagnoseSyncOnValueBasedClasses != 0) {
 968     load_klass(rax_reg, obj, t);
 969     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 970     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 971     jcc(Assembler::notZero, slow_path);
 972   }
 973 
 974   const Register mark = t;
 975 
 976   { // Lightweight Lock
 977 
 978     Label push;
 979 
 980     const Register top = box;
 981 
 982     // Load the mark.
 983     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 984 
 985     // Prefetch top.
 986     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 987 
 988     // Check for monitor (0b10).
 989     testptr(mark, markWord::monitor_value);
 990     jcc(Assembler::notZero, inflated);
 991 
 992     // Check if lock-stack is full.
 993     cmpl(top, LockStack::end_offset() - 1);
 994     jcc(Assembler::greater, slow_path);
 995 
 996     // Check if recursive.
 997     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 998     jccb(Assembler::equal, push);
 999 
1000     // Try to lock. Transition lock bits 0b01 => 0b00
1001     movptr(rax_reg, mark);
1002     orptr(rax_reg, markWord::unlocked_value);
1003     andptr(mark, ~(int32_t)markWord::unlocked_value);
1004     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1005     jcc(Assembler::notEqual, slow_path);
1006 
1007     bind(push);
1008     // After successful lock, push object on lock-stack.
1009     movptr(Address(thread, top), obj);
1010     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1011     xorl(rax_reg, rax_reg);
1012     jmpb(locked);
1013   }
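  // A rough C-like sketch of the fast path above (assumed helper names, for illustration only):
  //   markWord mark = obj->mark();
  //   if (mark.has_monitor())       goto inflated;
  //   if (lock_stack.is_full())     goto slow_path;
  //   if (lock_stack.top() == obj)  { lock_stack.push(obj); goto locked; }        // recursive
  //   if (!CAS(&obj->mark(), mark | unlocked, mark & ~unlocked)) goto slow_path;  // 0b01 -> 0b00
  //   lock_stack.push(obj); goto locked;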
1014 
1015   { // Handle inflated monitor.
1016     bind(inflated);
1017 
1018     const Register tagged_monitor = mark;
1019 
1020     // CAS owner (null => current thread).
1021     xorptr(rax_reg, rax_reg);
1022     movptr(box, Address(thread, JavaThread::lock_id_offset()));
1023     lock(); cmpxchgptr(box, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1024     jccb(Assembler::equal, locked);
1025 
1026     // Check if recursive.
1027     cmpptr(box, rax_reg);
1028     jccb(Assembler::notEqual, slow_path);
1029 
1030     // Recursive.
1031     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1032     xorl(rax_reg, rax_reg);
1033   }
1034 
1035   bind(locked);
1036 #ifdef ASSERT
1037   // Check that locked label is reached with ZF set.
1038   Label zf_correct;
1039   jccb(Assembler::zero, zf_correct);
1040   stop("Fast Lock ZF != 1");
1041 #endif
1042 
1043   bind(slow_path);
1044 #ifdef ASSERT
1045   // Check that slow_path label is reached with ZF not set.
1046   jccb(Assembler::notZero, zf_correct);
1047   stop("Fast Lock ZF != 0");
1048   bind(zf_correct);
1049 #endif
1050   // C2 uses the value of ZF to determine the continuation.
1051 }
1052 
1053 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t1, Register t2, Register thread) {
1054   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1055   assert(reg_rax == rax, "Used for CAS");
1056   assert_different_registers(obj, reg_rax, t1, t2);
1057 
1058   // Handle inflated monitor.
1059   Label inflated, inflated_check_lock_stack;
1060   // Finish fast unlock successfully.  MUST jump with ZF == 1
1061   Label unlocked;
1062 
1063   const Register mark = t1;
1064   const Register top = reg_rax;
1065 
1066   Label dummy;
1067   C2FastUnlockLightweightStub* stub = nullptr;
1068 
1069   if (!Compile::current()->output()->in_scratch_emit_size()) {
1070     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, t2, thread);
1071     Compile::current()->output()->add_stub(stub);
1072   }
1073 
1074   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1075   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1076 
1077   { // Lightweight Unlock
1078 
1079     // Load top.
1080     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1081 
1082     // Prefetch mark.
1083     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1084 
1085     // Check if obj is top of lock-stack.
1086     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1087     // Top of lock stack was not obj. Must be monitor.
1088     jcc(Assembler::notEqual, inflated_check_lock_stack);
1089 
1090     // Pop lock-stack.
1091     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1092     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1093 
1094     // Check if recursive.
1095     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1096     jcc(Assembler::equal, unlocked);
1097 
1098     // We elide the monitor check, let the CAS fail instead.
1099 
1100     // Try to unlock. Transition lock bits 0b00 => 0b01
1101     movptr(reg_rax, mark);
1102     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1103     orptr(mark, markWord::unlocked_value);
1104     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1105     jcc(Assembler::notEqual, push_and_slow_path);
1106     jmp(unlocked);
1107   }
1108 
1109 
1110   { // Handle inflated monitor.
1111     bind(inflated_check_lock_stack);
1112 #ifdef ASSERT
1113     Label check_done;
1114     subl(top, oopSize);
1115     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1116     jcc(Assembler::below, check_done);
1117     cmpptr(obj, Address(thread, top));
1118     jccb(Assembler::notEqual, inflated_check_lock_stack);
1119     stop("Fast Unlock lock on stack");
1120     bind(check_done);
1121     testptr(mark, markWord::monitor_value);
1122     jccb(Assembler::notZero, inflated);
1123     stop("Fast Unlock not monitor");
1124 #endif
1125 
1126     bind(inflated);
1127 
1128     // mark contains the tagged ObjectMonitor*.
1129     const Register monitor = mark;
1130 
1131 #ifndef _LP64
1132     // Check if recursive.
1133     xorptr(reg_rax, reg_rax);
1134     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1135     jcc(Assembler::notZero, check_successor);
1136 
1137     // Check if the entry lists are empty.
1138     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1139     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1140     jcc(Assembler::notZero, check_successor);
1141 
1142     // Release lock.
1143     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1144 #else // _LP64
1145     Label recursive;
1146 
1147     // Check if recursive.
1148     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1149     jccb(Assembler::notEqual, recursive);
1150 
1151     // Check if the entry lists are empty.
1152     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1153     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1154     jcc(Assembler::notZero, check_successor);
1155 
1156     // Release lock.
1157     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1158     jmpb(unlocked);
1159 
1160     // Recursive unlock.
1161     bind(recursive);
1162     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1163     xorl(t1, t1);
1164 #endif
1165   }
1166 
1167   bind(unlocked);
1168   if (stub != nullptr) {
1169     bind(stub->unlocked_continuation());
1170   }
1171 
1172 #ifdef ASSERT
1173   // Check that unlocked label is reached with ZF set.
1174   Label zf_correct;
1175   jccb(Assembler::zero, zf_correct);
1176   stop("Fast Unlock ZF != 1");
1177 #endif
1178 
1179   if (stub != nullptr) {
1180     bind(stub->slow_path_continuation());
1181   }
1182 #ifdef ASSERT
1183   // Check that stub->continuation() label is reached with ZF not set.
1184   jccb(Assembler::notZero, zf_correct);
1185   stop("Fast Unlock ZF != 0");
1186   bind(zf_correct);
1187 #endif
1188   // C2 uses the value of ZF to determine the continuation.
1189 }
1190 
1191 //-------------------------------------------------------------------------------------------
1192 // Generic instructions support for use in .ad files C2 code generation
1193 
1194 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1195   if (dst != src) {
1196     movdqu(dst, src);
1197   }
1198   if (opcode == Op_AbsVD) {
1199     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1200   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
1202     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1203   }
1204 }
1205 
1206 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1207   if (opcode == Op_AbsVD) {
1208     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1209   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
1211     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1212   }
1213 }
1214 
1215 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1216   if (dst != src) {
1217     movdqu(dst, src);
1218   }
1219   if (opcode == Op_AbsVF) {
1220     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1221   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1223     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1224   }
1225 }
1226 
1227 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1228   if (opcode == Op_AbsVF) {
1229     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1230   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1232     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1233   }
1234 }
1235 
1236 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1237   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1238   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1239 
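  // For T_LONG there is no pminsq/pmaxsq in this non-AVX512 path, so the long cases below are
  // synthesized as compare + blend: pcmpgtq leaves an all-ones/all-zeros mask per qword in xmm0,
  // and blendvpd (which implicitly keys off xmm0's sign bits) picks src where the mask is set.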
1240   if (opcode == Op_MinV) {
1241     if (elem_bt == T_BYTE) {
1242       pminsb(dst, src);
1243     } else if (elem_bt == T_SHORT) {
1244       pminsw(dst, src);
1245     } else if (elem_bt == T_INT) {
1246       pminsd(dst, src);
1247     } else {
1248       assert(elem_bt == T_LONG, "required");
1249       assert(tmp == xmm0, "required");
1250       assert_different_registers(dst, src, tmp);
1251       movdqu(xmm0, dst);
1252       pcmpgtq(xmm0, src);
1253       blendvpd(dst, src);  // xmm0 as mask
1254     }
1255   } else { // opcode == Op_MaxV
1256     if (elem_bt == T_BYTE) {
1257       pmaxsb(dst, src);
1258     } else if (elem_bt == T_SHORT) {
1259       pmaxsw(dst, src);
1260     } else if (elem_bt == T_INT) {
1261       pmaxsd(dst, src);
1262     } else {
1263       assert(elem_bt == T_LONG, "required");
1264       assert(tmp == xmm0, "required");
1265       assert_different_registers(dst, src, tmp);
1266       movdqu(xmm0, src);
1267       pcmpgtq(xmm0, dst);
1268       blendvpd(dst, src);  // xmm0 as mask
1269     }
1270   }
1271 }
1272 
1273 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1274                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1275                                  int vlen_enc) {
1276   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1277 
1278   if (opcode == Op_MinV) {
1279     if (elem_bt == T_BYTE) {
1280       vpminsb(dst, src1, src2, vlen_enc);
1281     } else if (elem_bt == T_SHORT) {
1282       vpminsw(dst, src1, src2, vlen_enc);
1283     } else if (elem_bt == T_INT) {
1284       vpminsd(dst, src1, src2, vlen_enc);
1285     } else {
1286       assert(elem_bt == T_LONG, "required");
1287       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1288         vpminsq(dst, src1, src2, vlen_enc);
1289       } else {
1290         assert_different_registers(dst, src1, src2);
1291         vpcmpgtq(dst, src1, src2, vlen_enc);
1292         vblendvpd(dst, src1, src2, dst, vlen_enc);
1293       }
1294     }
1295   } else { // opcode == Op_MaxV
1296     if (elem_bt == T_BYTE) {
1297       vpmaxsb(dst, src1, src2, vlen_enc);
1298     } else if (elem_bt == T_SHORT) {
1299       vpmaxsw(dst, src1, src2, vlen_enc);
1300     } else if (elem_bt == T_INT) {
1301       vpmaxsd(dst, src1, src2, vlen_enc);
1302     } else {
1303       assert(elem_bt == T_LONG, "required");
1304       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1305         vpmaxsq(dst, src1, src2, vlen_enc);
1306       } else {
1307         assert_different_registers(dst, src1, src2);
1308         vpcmpgtq(dst, src1, src2, vlen_enc);
1309         vblendvpd(dst, src2, src1, dst, vlen_enc);
1310       }
1311     }
1312   }
1313 }
1314 
1315 // Float/Double min max
1316 
1317 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1318                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1319                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1320                                    int vlen_enc) {
1321   assert(UseAVX > 0, "required");
1322   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1323          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1324   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1325   assert_different_registers(a, tmp, atmp, btmp);
1326   assert_different_registers(b, tmp, atmp, btmp);
1327 
1328   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1329   bool is_double_word = is_double_word_type(elem_bt);
1330 
1331   /* Note on 'non-obvious' assembly sequence:
1332    *
1333    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1334    * and Java on how they handle floats:
1335    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
1336    *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1337    *
1338    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1339    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1340    *                (only useful when signs differ, noop otherwise)
1341    *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
1342    *
1343    *  The following pseudocode describes the algorithm for max[FD] (the min algorithm is along similar lines):
1344    *   btmp = (b < +0.0) ? a : b
1345    *   atmp = (b < +0.0) ? b : a
1346    *   Tmp  = Max_Float(atmp, btmp)
1347    *   Res  = isNaN(atmp) ? atmp : Tmp
1348    */
1349 
1350   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1351   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1352   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1353   XMMRegister mask;
1354 
1355   if (!is_double_word && is_min) {
1356     mask = a;
1357     vblend = &MacroAssembler::vblendvps;
1358     vmaxmin = &MacroAssembler::vminps;
1359     vcmp = &MacroAssembler::vcmpps;
1360   } else if (!is_double_word && !is_min) {
1361     mask = b;
1362     vblend = &MacroAssembler::vblendvps;
1363     vmaxmin = &MacroAssembler::vmaxps;
1364     vcmp = &MacroAssembler::vcmpps;
1365   } else if (is_double_word && is_min) {
1366     mask = a;
1367     vblend = &MacroAssembler::vblendvpd;
1368     vmaxmin = &MacroAssembler::vminpd;
1369     vcmp = &MacroAssembler::vcmppd;
1370   } else {
1371     assert(is_double_word && !is_min, "sanity");
1372     mask = b;
1373     vblend = &MacroAssembler::vblendvpd;
1374     vmaxmin = &MacroAssembler::vmaxpd;
1375     vcmp = &MacroAssembler::vcmppd;
1376   }
1377 
1378   // Pick temporaries so register overlaps (dst aliasing btmp) neither break the sequence nor defeat the EnableX86ECoreOpts path
1379   XMMRegister maxmin, scratch;
1380   if (dst == btmp) {
1381     maxmin = btmp;
1382     scratch = tmp;
1383   } else {
1384     maxmin = tmp;
1385     scratch = btmp;
1386   }
1387 
1388   bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
1389   if (precompute_mask && !is_double_word) {
1390     vpsrad(tmp, mask, 32, vlen_enc);
1391     mask = tmp;
1392   } else if (precompute_mask && is_double_word) {
1393     vpxor(tmp, tmp, tmp, vlen_enc);
1394     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1395     mask = tmp;
1396   }
1397 
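       // Reorder the operands per the pseudocode above so that vmin/vmax's "return the second
       // operand" behaviour yields the Java result for +/-0.0; the UNORD compare then flags NaN
       // lanes of atmp and the final blend propagates them into dst.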
1398   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1399   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1400   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1401   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1402   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1403 }
1404 
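     // AVX-512 variant of the above: evpmovd2m/evpmovq2m copy the per-lane sign bits of the
     // biasing operand into a k-register, the masked blends reorder a/b, and the final masked
     // move copies NaN lanes of atmp over the vminp*/vmaxp* result.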
1405 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1406                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1407                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1408                                     int vlen_enc) {
1409   assert(UseAVX > 2, "required");
1410   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1411          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1412   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1413   assert_different_registers(dst, a, atmp, btmp);
1414   assert_different_registers(dst, b, atmp, btmp);
1415 
1416   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1417   bool is_double_word = is_double_word_type(elem_bt);
1418   bool merge = true;
1419 
1420   if (!is_double_word && is_min) {
1421     evpmovd2m(ktmp, a, vlen_enc);
1422     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1423     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1424     vminps(dst, atmp, btmp, vlen_enc);
1425     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1426     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1427   } else if (!is_double_word && !is_min) {
1428     evpmovd2m(ktmp, b, vlen_enc);
1429     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1430     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1431     vmaxps(dst, atmp, btmp, vlen_enc);
1432     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1433     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1434   } else if (is_double_word && is_min) {
1435     evpmovq2m(ktmp, a, vlen_enc);
1436     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1437     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1438     vminpd(dst, atmp, btmp, vlen_enc);
1439     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1440     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1441   } else {
1442     assert(is_double_word && !is_min, "sanity");
1443     evpmovq2m(ktmp, b, vlen_enc);
1444     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1445     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1446     vmaxpd(dst, atmp, btmp, vlen_enc);
1447     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1448     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1449   }
1450 }
1451 
1452 // Float/Double signum
1453 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1454   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1455 
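       // On entry dst holds the argument; the result is the argument itself for +/-0.0 and NaN,
       // +1.0 if the argument is positive, and -1.0 (1.0 with the sign bit flipped) otherwise.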
1456   Label DONE_LABEL;
1457 
1458   if (opcode == Op_SignumF) {
1459     assert(UseSSE > 0, "required");
1460     ucomiss(dst, zero);
1461     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1462     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1463     movflt(dst, one);
1464     jcc(Assembler::above, DONE_LABEL);
1465     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1466   } else if (opcode == Op_SignumD) {
1467     assert(UseSSE > 1, "required");
1468     ucomisd(dst, zero);
1469     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1470     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1471     movdbl(dst, one);
1472     jcc(Assembler::above, DONE_LABEL);
1473     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1474   }
1475 
1476   bind(DONE_LABEL);
1477 }
1478 
1479 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1480   if (sign) {
1481     pmovsxbw(dst, src);
1482   } else {
1483     pmovzxbw(dst, src);
1484   }
1485 }
1486 
1487 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1488   if (sign) {
1489     vpmovsxbw(dst, src, vector_len);
1490   } else {
1491     vpmovzxbw(dst, src, vector_len);
1492   }
1493 }
1494 
1495 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1496   if (sign) {
1497     vpmovsxbd(dst, src, vector_len);
1498   } else {
1499     vpmovzxbd(dst, src, vector_len);
1500   }
1501 }
1502 
1503 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1504   if (sign) {
1505     vpmovsxwd(dst, src, vector_len);
1506   } else {
1507     vpmovzxwd(dst, src, vector_len);
1508   }
1509 }
1510 
1511 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1512                                      int shift, int vector_len) {
1513   if (opcode == Op_RotateLeftV) {
1514     if (etype == T_INT) {
1515       evprold(dst, src, shift, vector_len);
1516     } else {
1517       assert(etype == T_LONG, "expected type T_LONG");
1518       evprolq(dst, src, shift, vector_len);
1519     }
1520   } else {
1521     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1522     if (etype == T_INT) {
1523       evprord(dst, src, shift, vector_len);
1524     } else {
1525       assert(etype == T_LONG, "expected type T_LONG");
1526       evprorq(dst, src, shift, vector_len);
1527     }
1528   }
1529 }
1530 
1531 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1532                                      XMMRegister shift, int vector_len) {
1533   if (opcode == Op_RotateLeftV) {
1534     if (etype == T_INT) {
1535       evprolvd(dst, src, shift, vector_len);
1536     } else {
1537       assert(etype == T_LONG, "expected type T_LONG");
1538       evprolvq(dst, src, shift, vector_len);
1539     }
1540   } else {
1541     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1542     if (etype == T_INT) {
1543       evprorvd(dst, src, shift, vector_len);
1544     } else {
1545       assert(etype == T_LONG, "expected type T_LONG");
1546       evprorvq(dst, src, shift, vector_len);
1547     }
1548   }
1549 }
1550 
1551 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1552   if (opcode == Op_RShiftVI) {
1553     psrad(dst, shift);
1554   } else if (opcode == Op_LShiftVI) {
1555     pslld(dst, shift);
1556   } else {
1557     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1558     psrld(dst, shift);
1559   }
1560 }
1561 
1562 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1563   switch (opcode) {
1564     case Op_RShiftVI:  psrad(dst, shift); break;
1565     case Op_LShiftVI:  pslld(dst, shift); break;
1566     case Op_URShiftVI: psrld(dst, shift); break;
1567 
1568     default: assert(false, "%s", NodeClassNames[opcode]);
1569   }
1570 }
1571 
1572 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1573   if (opcode == Op_RShiftVI) {
1574     vpsrad(dst, nds, shift, vector_len);
1575   } else if (opcode == Op_LShiftVI) {
1576     vpslld(dst, nds, shift, vector_len);
1577   } else {
1578     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1579     vpsrld(dst, nds, shift, vector_len);
1580   }
1581 }
1582 
1583 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1584   switch (opcode) {
1585     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1586     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1587     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1588 
1589     default: assert(false, "%s", NodeClassNames[opcode]);
1590   }
1591 }
1592 
1593 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1594   switch (opcode) {
1595     case Op_RShiftVB:  // fall-through
1596     case Op_RShiftVS:  psraw(dst, shift); break;
1597 
1598     case Op_LShiftVB:  // fall-through
1599     case Op_LShiftVS:  psllw(dst, shift);   break;
1600 
1601     case Op_URShiftVS: // fall-through
1602     case Op_URShiftVB: psrlw(dst, shift);  break;
1603 
1604     default: assert(false, "%s", NodeClassNames[opcode]);
1605   }
1606 }
1607 
1608 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1609   switch (opcode) {
1610     case Op_RShiftVB:  // fall-through
1611     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1612 
1613     case Op_LShiftVB:  // fall-through
1614     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1615 
1616     case Op_URShiftVS: // fall-through
1617     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1618 
1619     default: assert(false, "%s", NodeClassNames[opcode]);
1620   }
1621 }
1622 
1623 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1624   switch (opcode) {
1625     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1626     case Op_LShiftVL:  psllq(dst, shift); break;
1627     case Op_URShiftVL: psrlq(dst, shift); break;
1628 
1629     default: assert(false, "%s", NodeClassNames[opcode]);
1630   }
1631 }
1632 
1633 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1634   if (opcode == Op_RShiftVL) {
1635     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1636   } else if (opcode == Op_LShiftVL) {
1637     psllq(dst, shift);
1638   } else {
1639     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1640     psrlq(dst, shift);
1641   }
1642 }
1643 
1644 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1645   switch (opcode) {
1646     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1647     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1648     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1649 
1650     default: assert(false, "%s", NodeClassNames[opcode]);
1651   }
1652 }
1653 
1654 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1655   if (opcode == Op_RShiftVL) {
1656     evpsraq(dst, nds, shift, vector_len);
1657   } else if (opcode == Op_LShiftVL) {
1658     vpsllq(dst, nds, shift, vector_len);
1659   } else {
1660     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1661     vpsrlq(dst, nds, shift, vector_len);
1662   }
1663 }
1664 
1665 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1666   switch (opcode) {
1667     case Op_RShiftVB:  // fall-through
1668     case Op_RShiftVS:  // fall-through
1669     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1670 
1671     case Op_LShiftVB:  // fall-through
1672     case Op_LShiftVS:  // fall-through
1673     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1674 
1675     case Op_URShiftVB: // fall-through
1676     case Op_URShiftVS: // fall-through
1677     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1678 
1679     default: assert(false, "%s", NodeClassNames[opcode]);
1680   }
1681 }
1682 
1683 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1684   switch (opcode) {
1685     case Op_RShiftVB:  // fall-through
1686     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1687 
1688     case Op_LShiftVB:  // fall-through
1689     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1690 
1691     case Op_URShiftVB: // fall-through
1692     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1693 
1694     default: assert(false, "%s", NodeClassNames[opcode]);
1695   }
1696 }
1697 
1698 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1699   assert(UseAVX >= 2, "required");
1700   switch (opcode) {
1701     case Op_RShiftVL: {
1702       if (UseAVX > 2) {
1703         assert(tmp == xnoreg, "not used");
1704         if (!VM_Version::supports_avx512vl()) {
1705           vlen_enc = Assembler::AVX_512bit;
1706         }
1707         evpsravq(dst, src, shift, vlen_enc);
1708       } else {
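             // AVX2 has no vpsravq; emulate the arithmetic shift as (x >>> n ^ m) - m,
             // where m = 0x8000000000000000 >>> n re-extends the sign.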
1709         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1710         vpsrlvq(dst, src, shift, vlen_enc);
1711         vpsrlvq(tmp, tmp, shift, vlen_enc);
1712         vpxor(dst, dst, tmp, vlen_enc);
1713         vpsubq(dst, dst, tmp, vlen_enc);
1714       }
1715       break;
1716     }
1717     case Op_LShiftVL: {
1718       assert(tmp == xnoreg, "not used");
1719       vpsllvq(dst, src, shift, vlen_enc);
1720       break;
1721     }
1722     case Op_URShiftVL: {
1723       assert(tmp == xnoreg, "not used");
1724       vpsrlvq(dst, src, shift, vlen_enc);
1725       break;
1726     }
1727     default: assert(false, "%s", NodeClassNames[opcode]);
1728   }
1729 }
1730 
1731 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1732 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1733   assert(opcode == Op_LShiftVB ||
1734          opcode == Op_RShiftVB ||
1735          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1736   bool sign = (opcode != Op_URShiftVB);
1737   assert(vector_len == 0, "required");
1738   vextendbd(sign, dst, src, 1);
1739   vpmovzxbd(vtmp, shift, 1);
1740   varshiftd(opcode, dst, dst, vtmp, 1);
1741   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1742   vextracti128_high(vtmp, dst);
1743   vpackusdw(dst, dst, vtmp, 0);
1744 }
1745 
1746 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1747 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1748   assert(opcode == Op_LShiftVB ||
1749          opcode == Op_RShiftVB ||
1750          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1751   bool sign = (opcode != Op_URShiftVB);
1752   int ext_vector_len = vector_len + 1;
1753   vextendbw(sign, dst, src, ext_vector_len);
1754   vpmovzxbw(vtmp, shift, ext_vector_len);
1755   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1756   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1757   if (vector_len == 0) {
1758     vextracti128_high(vtmp, dst);
1759     vpackuswb(dst, dst, vtmp, vector_len);
1760   } else {
1761     vextracti64x4_high(vtmp, dst);
1762     vpackuswb(dst, dst, vtmp, vector_len);
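         // vpackuswb packs within 128-bit lanes, so permute the quadwords (0xD8 = 0,2,1,3)
         // to restore the original element order.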
1763     vpermq(dst, dst, 0xD8, vector_len);
1764   }
1765 }
1766 
1767 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1768   switch(typ) {
1769     case T_BYTE:
1770       pinsrb(dst, val, idx);
1771       break;
1772     case T_SHORT:
1773       pinsrw(dst, val, idx);
1774       break;
1775     case T_INT:
1776       pinsrd(dst, val, idx);
1777       break;
1778     case T_LONG:
1779       pinsrq(dst, val, idx);
1780       break;
1781     default:
1782       assert(false,"Should not reach here.");
1783       break;
1784   }
1785 }
1786 
1787 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1788   switch(typ) {
1789     case T_BYTE:
1790       vpinsrb(dst, src, val, idx);
1791       break;
1792     case T_SHORT:
1793       vpinsrw(dst, src, val, idx);
1794       break;
1795     case T_INT:
1796       vpinsrd(dst, src, val, idx);
1797       break;
1798     case T_LONG:
1799       vpinsrq(dst, src, val, idx);
1800       break;
1801     default:
1802       assert(false,"Should not reach here.");
1803       break;
1804   }
1805 }
1806 
1807 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1808   switch(typ) {
1809     case T_INT:
1810       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1811       break;
1812     case T_FLOAT:
1813       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1814       break;
1815     case T_LONG:
1816       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1817       break;
1818     case T_DOUBLE:
1819       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1820       break;
1821     default:
1822       assert(false,"Should not reach here.");
1823       break;
1824   }
1825 }
1826 
1827 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1828   switch(typ) {
1829     case T_INT:
1830       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1831       break;
1832     case T_FLOAT:
1833       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1834       break;
1835     case T_LONG:
1836       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1837       break;
1838     case T_DOUBLE:
1839       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1840       break;
1841     default:
1842       assert(false,"Should not reach here.");
1843       break;
1844   }
1845 }
1846 
1847 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1848   switch(typ) {
1849     case T_INT:
1850       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1851       break;
1852     case T_FLOAT:
1853       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1854       break;
1855     case T_LONG:
1856       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1857       break;
1858     case T_DOUBLE:
1859       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1860       break;
1861     default:
1862       assert(false,"Should not reach here.");
1863       break;
1864   }
1865 }
1866 
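     // Expand a boolean vector (one byte per lane, 0 or 1) into a full-width element mask:
     // 0 - b turns each byte into 0x00 or 0xFF, which is then sign-extended to the element size.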
1867 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1868   if (vlen_in_bytes <= 16) {
1869     pxor (dst, dst);
1870     psubb(dst, src);
1871     switch (elem_bt) {
1872       case T_BYTE:   /* nothing to do */ break;
1873       case T_SHORT:  pmovsxbw(dst, dst); break;
1874       case T_INT:    pmovsxbd(dst, dst); break;
1875       case T_FLOAT:  pmovsxbd(dst, dst); break;
1876       case T_LONG:   pmovsxbq(dst, dst); break;
1877       case T_DOUBLE: pmovsxbq(dst, dst); break;
1878 
1879       default: assert(false, "%s", type2name(elem_bt));
1880     }
1881   } else {
1882     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1883     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1884 
1885     vpxor (dst, dst, dst, vlen_enc);
1886     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1887 
1888     switch (elem_bt) {
1889       case T_BYTE:   /* nothing to do */            break;
1890       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1891       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1892       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1893       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1894       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1895 
1896       default: assert(false, "%s", type2name(elem_bt));
1897     }
1898   }
1899 }
1900 
1901 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1902   if (novlbwdq) {
1903     vpmovsxbd(xtmp, src, vlen_enc);
1904     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1905             Assembler::eq, true, vlen_enc, noreg);
1906   } else {
1907     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1908     vpsubb(xtmp, xtmp, src, vlen_enc);
1909     evpmovb2m(dst, xtmp, vlen_enc);
1910   }
1911 }
1912 
1913 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1914   switch (vlen_in_bytes) {
1915     case 4:  movdl(dst, src);   break;
1916     case 8:  movq(dst, src);    break;
1917     case 16: movdqu(dst, src);  break;
1918     case 32: vmovdqu(dst, src); break;
1919     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1920     default: ShouldNotReachHere();
1921   }
1922 }
1923 
1924 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1925   assert(rscratch != noreg || always_reachable(src), "missing");
1926 
1927   if (reachable(src)) {
1928     load_vector(dst, as_Address(src), vlen_in_bytes);
1929   } else {
1930     lea(rscratch, src);
1931     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1932   }
1933 }
1934 
1935 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1936   int vlen_enc = vector_length_encoding(vlen);
1937   if (VM_Version::supports_avx()) {
1938     if (bt == T_LONG) {
1939       if (VM_Version::supports_avx2()) {
1940         vpbroadcastq(dst, src, vlen_enc);
1941       } else {
1942         vmovddup(dst, src, vlen_enc);
1943       }
1944     } else if (bt == T_DOUBLE) {
1945       if (vlen_enc != Assembler::AVX_128bit) {
1946         vbroadcastsd(dst, src, vlen_enc, noreg);
1947       } else {
1948         vmovddup(dst, src, vlen_enc);
1949       }
1950     } else {
1951       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1952         vpbroadcastd(dst, src, vlen_enc);
1953       } else {
1954         vbroadcastss(dst, src, vlen_enc);
1955       }
1956     }
1957   } else if (VM_Version::supports_sse3()) {
1958     movddup(dst, src);
1959   } else {
1960     movq(dst, src);
1961     if (vlen == 16) {
1962       punpcklqdq(dst, dst);
1963     }
1964   }
1965 }
1966 
1967 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1968   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1969   int offset = exact_log2(type2aelembytes(bt)) << 6;
1970   if (is_floating_point_type(bt)) {
1971     offset += 128;
1972   }
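       // e.g. T_INT: 2 * 64 = 128, T_FLOAT: 2 * 64 + 128 = 256, T_DOUBLE: 3 * 64 + 128 = 320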
1973   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1974   load_vector(dst, addr, vlen_in_bytes);
1975 }
1976 
1977 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
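     // The general pattern: repeatedly fold the upper half of the vector onto the lower half
     // (vextract*/pshufd plus a scalar reduce_operation), fold in the scalar input src1, and
     // finally move the surviving lane into the destination.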
1978 
1979 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1980   int vector_len = Assembler::AVX_128bit;
1981 
1982   switch (opcode) {
1983     case Op_AndReductionV:  pand(dst, src); break;
1984     case Op_OrReductionV:   por (dst, src); break;
1985     case Op_XorReductionV:  pxor(dst, src); break;
1986     case Op_MinReductionV:
1987       switch (typ) {
1988         case T_BYTE:        pminsb(dst, src); break;
1989         case T_SHORT:       pminsw(dst, src); break;
1990         case T_INT:         pminsd(dst, src); break;
1991         case T_LONG:        assert(UseAVX > 2, "required");
1992                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1993         default:            assert(false, "wrong type");
1994       }
1995       break;
1996     case Op_MaxReductionV:
1997       switch (typ) {
1998         case T_BYTE:        pmaxsb(dst, src); break;
1999         case T_SHORT:       pmaxsw(dst, src); break;
2000         case T_INT:         pmaxsd(dst, src); break;
2001         case T_LONG:        assert(UseAVX > 2, "required");
2002                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
2003         default:            assert(false, "wrong type");
2004       }
2005       break;
2006     case Op_AddReductionVF: addss(dst, src); break;
2007     case Op_AddReductionVD: addsd(dst, src); break;
2008     case Op_AddReductionVI:
2009       switch (typ) {
2010         case T_BYTE:        paddb(dst, src); break;
2011         case T_SHORT:       paddw(dst, src); break;
2012         case T_INT:         paddd(dst, src); break;
2013         default:            assert(false, "wrong type");
2014       }
2015       break;
2016     case Op_AddReductionVL: paddq(dst, src); break;
2017     case Op_MulReductionVF: mulss(dst, src); break;
2018     case Op_MulReductionVD: mulsd(dst, src); break;
2019     case Op_MulReductionVI:
2020       switch (typ) {
2021         case T_SHORT:       pmullw(dst, src); break;
2022         case T_INT:         pmulld(dst, src); break;
2023         default:            assert(false, "wrong type");
2024       }
2025       break;
2026     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2027                             evpmullq(dst, dst, src, vector_len); break;
2028     default:                assert(false, "wrong opcode");
2029   }
2030 }
2031 
2032 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2033   int vector_len = Assembler::AVX_256bit;
2034 
2035   switch (opcode) {
2036     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2037     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2038     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2039     case Op_MinReductionV:
2040       switch (typ) {
2041         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2042         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2043         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2044         case T_LONG:        assert(UseAVX > 2, "required");
2045                             vpminsq(dst, src1, src2, vector_len); break;
2046         default:            assert(false, "wrong type");
2047       }
2048       break;
2049     case Op_MaxReductionV:
2050       switch (typ) {
2051         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2052         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2053         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2054         case T_LONG:        assert(UseAVX > 2, "required");
2055                             vpmaxsq(dst, src1, src2, vector_len); break;
2056         default:            assert(false, "wrong type");
2057       }
2058       break;
2059     case Op_AddReductionVI:
2060       switch (typ) {
2061         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2062         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2063         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2064         default:            assert(false, "wrong type");
2065       }
2066       break;
2067     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2068     case Op_MulReductionVI:
2069       switch (typ) {
2070         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2071         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2072         default:            assert(false, "wrong type");
2073       }
2074       break;
2075     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2076     default:                assert(false, "wrong opcode");
2077   }
2078 }
2079 
2080 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2081                                   XMMRegister dst, XMMRegister src,
2082                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2083   switch (opcode) {
2084     case Op_AddReductionVF:
2085     case Op_MulReductionVF:
2086       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2087       break;
2088 
2089     case Op_AddReductionVD:
2090     case Op_MulReductionVD:
2091       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2092       break;
2093 
2094     default: assert(false, "wrong opcode");
2095   }
2096 }
2097 
2098 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2099                              Register dst, Register src1, XMMRegister src2,
2100                              XMMRegister vtmp1, XMMRegister vtmp2) {
2101   switch (vlen) {
2102     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2103     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2104     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2105     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2106 
2107     default: assert(false, "wrong vector length");
2108   }
2109 }
2110 
2111 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2112                              Register dst, Register src1, XMMRegister src2,
2113                              XMMRegister vtmp1, XMMRegister vtmp2) {
2114   switch (vlen) {
2115     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2116     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2117     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2118     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2119 
2120     default: assert(false, "wrong vector length");
2121   }
2122 }
2123 
2124 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2125                              Register dst, Register src1, XMMRegister src2,
2126                              XMMRegister vtmp1, XMMRegister vtmp2) {
2127   switch (vlen) {
2128     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2129     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2130     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2131     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2132 
2133     default: assert(false, "wrong vector length");
2134   }
2135 }
2136 
2137 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2138                              Register dst, Register src1, XMMRegister src2,
2139                              XMMRegister vtmp1, XMMRegister vtmp2) {
2140   switch (vlen) {
2141     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2142     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2143     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2144     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2145 
2146     default: assert(false, "wrong vector length");
2147   }
2148 }
2149 
2150 #ifdef _LP64
2151 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2152                              Register dst, Register src1, XMMRegister src2,
2153                              XMMRegister vtmp1, XMMRegister vtmp2) {
2154   switch (vlen) {
2155     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2156     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2157     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2158 
2159     default: assert(false, "wrong vector length");
2160   }
2161 }
2162 #endif // _LP64
2163 
2164 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2165   switch (vlen) {
2166     case 2:
2167       assert(vtmp2 == xnoreg, "");
2168       reduce2F(opcode, dst, src, vtmp1);
2169       break;
2170     case 4:
2171       assert(vtmp2 == xnoreg, "");
2172       reduce4F(opcode, dst, src, vtmp1);
2173       break;
2174     case 8:
2175       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2176       break;
2177     case 16:
2178       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2179       break;
2180     default: assert(false, "wrong vector length");
2181   }
2182 }
2183 
2184 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2185   switch (vlen) {
2186     case 2:
2187       assert(vtmp2 == xnoreg, "");
2188       reduce2D(opcode, dst, src, vtmp1);
2189       break;
2190     case 4:
2191       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2192       break;
2193     case 8:
2194       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2195       break;
2196     default: assert(false, "wrong vector length");
2197   }
2198 }
2199 
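     // Fold two int lanes into one (phaddd for add, otherwise shuffle lane 1 down and combine),
     // then fold in the scalar input src1 and move the result into the GPR dst.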
2200 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2201   if (opcode == Op_AddReductionVI) {
2202     if (vtmp1 != src2) {
2203       movdqu(vtmp1, src2);
2204     }
2205     phaddd(vtmp1, vtmp1);
2206   } else {
2207     pshufd(vtmp1, src2, 0x1);
2208     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2209   }
2210   movdl(vtmp2, src1);
2211   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2212   movdl(dst, vtmp1);
2213 }
2214 
2215 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2216   if (opcode == Op_AddReductionVI) {
2217     if (vtmp1 != src2) {
2218       movdqu(vtmp1, src2);
2219     }
2220     phaddd(vtmp1, src2);
2221     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2222   } else {
2223     pshufd(vtmp2, src2, 0xE);
2224     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2225     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2226   }
2227 }
2228 
2229 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2230   if (opcode == Op_AddReductionVI) {
2231     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2232     vextracti128_high(vtmp2, vtmp1);
2233     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2234     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2235   } else {
2236     vextracti128_high(vtmp1, src2);
2237     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2238     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2239   }
2240 }
2241 
2242 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2243   vextracti64x4_high(vtmp2, src2);
2244   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2245   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2246 }
2247 
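     // Fold 8 bytes by combining halves at decreasing distances (4, 2, 1 bytes), widen the
     // surviving byte to an int to fold in src1, then sign-extend the byte result into dst.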
2248 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2249   pshufd(vtmp2, src2, 0x1);
2250   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2251   movdqu(vtmp1, vtmp2);
2252   psrldq(vtmp1, 2);
2253   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2254   movdqu(vtmp2, vtmp1);
2255   psrldq(vtmp2, 1);
2256   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2257   movdl(vtmp2, src1);
2258   pmovsxbd(vtmp1, vtmp1);
2259   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2260   pextrb(dst, vtmp1, 0x0);
2261   movsbl(dst, dst);
2262 }
2263 
2264 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2265   pshufd(vtmp1, src2, 0xE);
2266   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2267   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2268 }
2269 
2270 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2271   vextracti128_high(vtmp2, src2);
2272   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2273   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2274 }
2275 
2276 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2277   vextracti64x4_high(vtmp1, src2);
2278   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2279   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2280 }
2281 
2282 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2283   pmovsxbw(vtmp2, src2);
2284   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2285 }
2286 
2287 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2288   if (UseAVX > 1) {
2289     int vector_len = Assembler::AVX_256bit;
2290     vpmovsxbw(vtmp1, src2, vector_len);
2291     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2292   } else {
2293     pmovsxbw(vtmp2, src2);
2294     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2295     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 down to the low half
2296     pmovsxbw(vtmp2, vtmp2);
2297     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2298   }
2299 }
2300 
2301 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2302   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2303     int vector_len = Assembler::AVX_512bit;
2304     vpmovsxbw(vtmp1, src2, vector_len);
2305     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2306   } else {
2307     assert(UseAVX >= 2,"Should not reach here.");
2308     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2309     vextracti128_high(vtmp2, src2);
2310     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2311   }
2312 }
2313 
2314 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2315   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2316   vextracti64x4_high(vtmp2, src2);
2317   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2318 }
2319 
2320 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2321   if (opcode == Op_AddReductionVI) {
2322     if (vtmp1 != src2) {
2323       movdqu(vtmp1, src2);
2324     }
2325     phaddw(vtmp1, vtmp1);
2326     phaddw(vtmp1, vtmp1);
2327   } else {
2328     pshufd(vtmp2, src2, 0x1);
2329     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2330     movdqu(vtmp1, vtmp2);
2331     psrldq(vtmp1, 2);
2332     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2333   }
2334   movdl(vtmp2, src1);
2335   pmovsxwd(vtmp1, vtmp1);
2336   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2337   pextrw(dst, vtmp1, 0x0);
2338   movswl(dst, dst);
2339 }
2340 
2341 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2342   if (opcode == Op_AddReductionVI) {
2343     if (vtmp1 != src2) {
2344       movdqu(vtmp1, src2);
2345     }
2346     phaddw(vtmp1, src2);
2347   } else {
2348     pshufd(vtmp1, src2, 0xE);
2349     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2350   }
2351   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2352 }
2353 
2354 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2355   if (opcode == Op_AddReductionVI) {
2356     int vector_len = Assembler::AVX_256bit;
2357     vphaddw(vtmp2, src2, src2, vector_len);
2358     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2359   } else {
2360     vextracti128_high(vtmp2, src2);
2361     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2362   }
2363   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2364 }
2365 
2366 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2367   int vector_len = Assembler::AVX_256bit;
2368   vextracti64x4_high(vtmp1, src2);
2369   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2370   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2371 }
2372 
2373 #ifdef _LP64
2374 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2375   pshufd(vtmp2, src2, 0xE);
2376   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2377   movdq(vtmp1, src1);
2378   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2379   movdq(dst, vtmp1);
2380 }
2381 
2382 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2383   vextracti128_high(vtmp1, src2);
2384   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2385   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2386 }
2387 
2388 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2389   vextracti64x4_high(vtmp2, src2);
2390   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2391   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2392 }
2393 
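     // Materialize a k-register whose low 'len' bits are set: BZHI zeroes the bits of the
     // all-ones temp at positions >= len before it is moved into the mask register.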
2394 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2395   mov64(temp, -1L);
2396   bzhiq(temp, temp, len);
2397   kmovql(dst, temp);
2398 }
2399 #endif // _LP64
2400 
2401 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2402   reduce_operation_128(T_FLOAT, opcode, dst, src);
2403   pshufd(vtmp, src, 0x1);
2404   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2405 }
2406 
2407 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2408   reduce2F(opcode, dst, src, vtmp);
2409   pshufd(vtmp, src, 0x2);
2410   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2411   pshufd(vtmp, src, 0x3);
2412   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2413 }
2414 
2415 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2416   reduce4F(opcode, dst, src, vtmp2);
2417   vextractf128_high(vtmp2, src);
2418   reduce4F(opcode, dst, vtmp2, vtmp1);
2419 }
2420 
2421 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2422   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2423   vextracti64x4_high(vtmp1, src);
2424   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2425 }
2426 
2427 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2428   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2429   pshufd(vtmp, src, 0xE);
2430   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2431 }
2432 
2433 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2434   reduce2D(opcode, dst, src, vtmp2);
2435   vextractf128_high(vtmp2, src);
2436   reduce2D(opcode, dst, vtmp2, vtmp1);
2437 }
2438 
2439 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2440   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2441   vextracti64x4_high(vtmp1, src);
2442   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2443 }
2444 
2445 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2446   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2447 }
2448 
2449 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2450   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2451 }
2452 
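     // Masked lane load/store: integral element types reuse the FP vmaskmovps/pd forms of the
     // matching width; only the most-significant bit of each mask element is consulted.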
2453 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2454                                  int vec_enc) {
2455   switch(elem_bt) {
2456     case T_INT:
2457     case T_FLOAT:
2458       vmaskmovps(dst, src, mask, vec_enc);
2459       break;
2460     case T_LONG:
2461     case T_DOUBLE:
2462       vmaskmovpd(dst, src, mask, vec_enc);
2463       break;
2464     default:
2465       fatal("Unsupported type %s", type2name(elem_bt));
2466       break;
2467   }
2468 }
2469 
2470 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2471                                  int vec_enc) {
2472   switch(elem_bt) {
2473     case T_INT:
2474     case T_FLOAT:
2475       vmaskmovps(dst, src, mask, vec_enc);
2476       break;
2477     case T_LONG:
2478     case T_DOUBLE:
2479       vmaskmovpd(dst, src, mask, vec_enc);
2480       break;
2481     default:
2482       fatal("Unsupported type %s", type2name(elem_bt));
2483       break;
2484   }
2485 }
2486 
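     // FP min/max reductions fold halves through vminmax_fp so Java's NaN and -0.0 rules hold at
     // every step; the vpermilps immediates {1, 14} handle the last two in-lane folding steps.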
2487 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2488                                           XMMRegister dst, XMMRegister src,
2489                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2490                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2491   const int permconst[] = {1, 14};
2492   XMMRegister wsrc = src;
2493   XMMRegister wdst = xmm_0;
2494   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2495 
2496   int vlen_enc = Assembler::AVX_128bit;
2497   if (vlen == 16) {
2498     vlen_enc = Assembler::AVX_256bit;
2499   }
2500 
2501   for (int i = log2(vlen) - 1; i >= 0; i--) {
2502     if (i == 0 && !is_dst_valid) {
2503       wdst = dst;
2504     }
2505     if (i == 3) {
2506       vextracti64x4_high(wtmp, wsrc);
2507     } else if (i == 2) {
2508       vextracti128_high(wtmp, wsrc);
2509     } else { // i = [0,1]
2510       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2511     }
2512     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2513     wsrc = wdst;
2514     vlen_enc = Assembler::AVX_128bit;
2515   }
2516   if (is_dst_valid) {
2517     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2518   }
2519 }
2520 
2521 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2522                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2523                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2524   XMMRegister wsrc = src;
2525   XMMRegister wdst = xmm_0;
2526   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2527   int vlen_enc = Assembler::AVX_128bit;
2528   if (vlen == 8) {
2529     vlen_enc = Assembler::AVX_256bit;
2530   }
2531   for (int i = log2(vlen) - 1; i >= 0; i--) {
2532     if (i == 0 && !is_dst_valid) {
2533       wdst = dst;
2534     }
2535     if (i == 1) {
2536       vextracti128_high(wtmp, wsrc);
2537     } else if (i == 2) {
2538       vextracti64x4_high(wtmp, wsrc);
2539     } else {
2540       assert(i == 0, "%d", i);
2541       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2542     }
2543     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2544     wsrc = wdst;
2545     vlen_enc = Assembler::AVX_128bit;
2546   }
2547   if (is_dst_valid) {
2548     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2549   }
2550 }
2551 
2552 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2553   switch (bt) {
2554     case T_BYTE:  pextrb(dst, src, idx); break;
2555     case T_SHORT: pextrw(dst, src, idx); break;
2556     case T_INT:   pextrd(dst, src, idx); break;
2557     case T_LONG:  pextrq(dst, src, idx); break;
2558 
2559     default:
2560       assert(false,"Should not reach here.");
2561       break;
2562   }
2563 }
2564 
2565 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2566   int esize =  type2aelembytes(typ);
2567   int elem_per_lane = 16/esize;
2568   int lane = elemindex / elem_per_lane;
2569   int eindex = elemindex % elem_per_lane;
2570 
2571   if (lane >= 2) {
2572     assert(UseAVX > 2, "required");
2573     vextractf32x4(dst, src, lane & 3);
2574     return dst;
2575   } else if (lane > 0) {
2576     assert(UseAVX > 0, "required");
2577     vextractf128(dst, src, lane);
2578     return dst;
2579   } else {
2580     return src;
2581   }
2582 }
2583 
2584 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2585   if (typ == T_BYTE) {
2586     movsbl(dst, dst);
2587   } else if (typ == T_SHORT) {
2588     movswl(dst, dst);
2589   }
2590 }
2591 
2592 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2593   int esize =  type2aelembytes(typ);
2594   int elem_per_lane = 16/esize;
2595   int eindex = elemindex % elem_per_lane;
2596   assert(is_integral_type(typ),"required");
2597 
2598   if (eindex == 0) {
2599     if (typ == T_LONG) {
2600       movq(dst, src);
2601     } else {
2602       movdl(dst, src);
2603       movsxl(typ, dst);
2604     }
2605   } else {
2606     extract(typ, dst, src, eindex);
2607     movsxl(typ, dst);
2608   }
2609 }
2610 
2611 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2612   int esize =  type2aelembytes(typ);
2613   int elem_per_lane = 16/esize;
2614   int eindex = elemindex % elem_per_lane;
2615   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2616 
2617   if (eindex == 0) {
2618     movq(dst, src);
2619   } else {
2620     if (typ == T_FLOAT) {
2621       if (UseAVX == 0) {
2622         movdqu(dst, src);
2623         shufps(dst, dst, eindex);
2624       } else {
2625         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2626       }
2627     } else {
2628       if (UseAVX == 0) {
2629         movdqu(dst, src);
2630         psrldq(dst, eindex*esize);
2631       } else {
2632         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2633       }
2634       movq(dst, dst);
2635     }
2636   }
2637   // Zero upper bits
2638   if (typ == T_FLOAT) {
2639     if (UseAVX == 0) {
2640       assert(vtmp != xnoreg, "required.");
2641       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2642       pand(dst, vtmp);
2643     } else {
2644       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2645     }
2646   }
2647 }
2648 
2649 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2650   switch(typ) {
2651     case T_BYTE:
2652     case T_BOOLEAN:
2653       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2654       break;
2655     case T_SHORT:
2656     case T_CHAR:
2657       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2658       break;
2659     case T_INT:
2660     case T_FLOAT:
2661       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2662       break;
2663     case T_LONG:
2664     case T_DOUBLE:
2665       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2666       break;
2667     default:
2668       assert(false,"Should not reach here.");
2669       break;
2670   }
2671 }
2672 
2673 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2674   assert(rscratch != noreg || always_reachable(src2), "missing");
2675 
2676   switch(typ) {
2677     case T_BOOLEAN:
2678     case T_BYTE:
2679       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2680       break;
2681     case T_CHAR:
2682     case T_SHORT:
2683       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2684       break;
2685     case T_INT:
2686     case T_FLOAT:
2687       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2688       break;
2689     case T_LONG:
2690     case T_DOUBLE:
2691       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2692       break;
2693     default:
2694       assert(false,"Should not reach here.");
2695       break;
2696   }
2697 }
2698 
2699 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2700   switch(typ) {
2701     case T_BYTE:
2702       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2703       break;
2704     case T_SHORT:
2705       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2706       break;
2707     case T_INT:
2708     case T_FLOAT:
2709       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2710       break;
2711     case T_LONG:
2712     case T_DOUBLE:
2713       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2714       break;
2715     default:
2716       assert(false,"Should not reach here.");
2717       break;
2718   }
2719 }
2720 
2721 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2722   assert(vlen_in_bytes <= 32, "");
2723   int esize = type2aelembytes(bt);
2724   if (vlen_in_bytes == 32) {
2725     assert(vtmp == xnoreg, "required.");
2726     if (esize >= 4) {
2727       vtestps(src1, src2, AVX_256bit);
2728     } else {
2729       vptest(src1, src2, AVX_256bit);
2730     }
2731     return;
2732   }
2733   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // there is no need to do so for src2.
2736     assert(vtmp != xnoreg, "required");
2737     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2738     pshufd(vtmp, src1, shuffle_imm);
2739   } else {
2740     assert(vtmp == xnoreg, "required");
2741     vtmp = src1;
2742   }
2743   if (esize >= 4 && VM_Version::supports_avx()) {
2744     vtestps(vtmp, src2, AVX_128bit);
2745   } else {
2746     ptest(vtmp, src2);
2747   }
2748 }
2749 
2750 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2751   assert(UseAVX >= 2, "required");
2752 #ifdef ASSERT
2753   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2754   bool is_bw_supported = VM_Version::supports_avx512bw();
2755   if (is_bw && !is_bw_supported) {
2756     assert(vlen_enc != Assembler::AVX_512bit, "required");
2757     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2758            "XMM register should be 0-15");
2759   }
2760 #endif // ASSERT
2761   switch (elem_bt) {
2762     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2763     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2764     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2765     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2766     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2767     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2768     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2769   }
2770 }
2771 
2772 #ifdef _LP64
2773 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2774   assert(UseAVX >= 2, "required");
2775   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2776   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2777   if ((UseAVX > 2) &&
2778       (!is_bw || VM_Version::supports_avx512bw()) &&
2779       (!is_vl || VM_Version::supports_avx512vl())) {
2780     switch (elem_bt) {
2781       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2782       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2783       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2784       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2785       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2786     }
2787   } else {
2788     assert(vlen_enc != Assembler::AVX_512bit, "required");
2789     assert((dst->encoding() < 16),"XMM register should be 0-15");
2790     switch (elem_bt) {
2791       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2792       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2793       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2794       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2795       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2796       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2797       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2798     }
2799   }
2800 }
2801 #endif
2802 
2803 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2804   switch (to_elem_bt) {
2805     case T_SHORT:
2806       vpmovsxbw(dst, src, vlen_enc);
2807       break;
2808     case T_INT:
2809       vpmovsxbd(dst, src, vlen_enc);
2810       break;
2811     case T_FLOAT:
2812       vpmovsxbd(dst, src, vlen_enc);
2813       vcvtdq2ps(dst, dst, vlen_enc);
2814       break;
2815     case T_LONG:
2816       vpmovsxbq(dst, src, vlen_enc);
2817       break;
2818     case T_DOUBLE: {
2819       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2820       vpmovsxbd(dst, src, mid_vlen_enc);
2821       vcvtdq2pd(dst, dst, vlen_enc);
2822       break;
2823     }
2824     default:
2825       fatal("Unsupported type %s", type2name(to_elem_bt));
2826       break;
2827   }
2828 }
2829 
2830 //-------------------------------------------------------------------------------------------
2831 
2832 // IndexOf for constant substrings with size >= 8 chars
2833 // which don't need to be loaded through stack.
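//
// Java-like sketch of the result this routine computes (illustration only,
// not the emitted code; the actual comparison is done on whole vectors
// with pcmpestri):
//   int indexOfC8(str1, cnt1, str2, int_cnt2) {   // int_cnt2 is a compile-time constant
//     for (int i = 0; i + int_cnt2 <= cnt1; i++) {
//       if (regionMatches(str1, i, str2, 0, int_cnt2)) return i;
//     }
//     return -1;
//   }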
2834 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2835                                          Register cnt1, Register cnt2,
2836                                          int int_cnt2,  Register result,
2837                                          XMMRegister vec, Register tmp,
2838                                          int ae) {
2839   ShortBranchVerifier sbv(this);
2840   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2841   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2842 
2843   // This method uses the pcmpestri instruction with bound registers
2844   //   inputs:
2845   //     xmm - substring
2846   //     rax - substring length (elements count)
2847   //     mem - scanned string
2848   //     rdx - string length (elements count)
2849   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2850   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2851   //   outputs:
2852   //     rcx - matched index in string
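  //
  // Worked breakdown of the mode byte (per the SSE4.2 PCMPESTRI immediate
  // encoding; shown only as an illustration):
  //   0x0d = 0b0001101: bits[1:0] = 01 -> unsigned words (shorts)
  //                     bits[3:2] = 11 -> "equal ordered" (substring search)
  //                     bits[5:4] = 00 -> positive polarity
  //                     bit 6     = 0  -> return least significant index
  //   0x0c differs only in bits[1:0] = 00 -> unsigned bytes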
2853   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2854   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2855   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2856   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2857   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2858 
2859   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2860         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2861         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2862 
2863   // Note, inline_string_indexOf() generates checks:
2864   // if (substr.count > string.count) return -1;
2865   // if (substr.count == 0) return 0;
2866   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2867 
2868   // Load substring.
2869   if (ae == StrIntrinsicNode::UL) {
2870     pmovzxbw(vec, Address(str2, 0));
2871   } else {
2872     movdqu(vec, Address(str2, 0));
2873   }
2874   movl(cnt2, int_cnt2);
2875   movptr(result, str1); // string addr
2876 
2877   if (int_cnt2 > stride) {
2878     jmpb(SCAN_TO_SUBSTR);
2879 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2882     bind(RELOAD_SUBSTR);
2883     if (ae == StrIntrinsicNode::UL) {
2884       pmovzxbw(vec, Address(str2, 0));
2885     } else {
2886       movdqu(vec, Address(str2, 0));
2887     }
2888     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2889 
2890     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2894 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the cmp failed.
2897     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2898     subl(cnt1, cnt2);
2899     addl(cnt1, int_cnt2);
2900     movl(cnt2, int_cnt2); // Now restore cnt2
2901 
2902     decrementl(cnt1);     // Shift to next element
2903     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2905 
2906     addptr(result, (1<<scale1));
2907 
2908   } // (int_cnt2 > 8)
2909 
2910   // Scan string for start of substr in 16-byte vectors
2911   bind(SCAN_TO_SUBSTR);
2912   pcmpestri(vec, Address(result, 0), mode);
2913   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2914   subl(cnt1, stride);
2915   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2916   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2918   addptr(result, 16);
2919   jmpb(SCAN_TO_SUBSTR);
2920 
2921   // Found a potential substr
2922   bind(FOUND_CANDIDATE);
2923   // Matched whole vector if first element matched (tmp(rcx) == 0).
2924   if (int_cnt2 == stride) {
2925     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2926   } else { // int_cnt2 > 8
2927     jccb(Assembler::overflow, FOUND_SUBSTR);
2928   }
2929   // After pcmpestri tmp(rcx) contains matched element index
2930   // Compute start addr of substr
2931   lea(result, Address(result, tmp, scale1));
2932 
2933   // Make sure string is still long enough
2934   subl(cnt1, tmp);
2935   cmpl(cnt1, cnt2);
2936   if (int_cnt2 == stride) {
2937     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2938   } else { // int_cnt2 > 8
2939     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2940   }
  // Left less than substring.
2942 
2943   bind(RET_NOT_FOUND);
2944   movl(result, -1);
2945   jmp(EXIT);
2946 
2947   if (int_cnt2 > stride) {
2948     // This code is optimized for the case when whole substring
2949     // is matched if its head is matched.
2950     bind(MATCH_SUBSTR_HEAD);
2951     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2953     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2954 
2955     Label CONT_SCAN_SUBSTR;
2956     // Compare the rest of substring (> 8 chars).
2957     bind(FOUND_SUBSTR);
2958     // First 8 chars are already matched.
2959     negptr(cnt2);
2960     addptr(cnt2, stride);
2961 
2962     bind(SCAN_SUBSTR);
2963     subl(cnt1, stride);
2964     cmpl(cnt2, -stride); // Do not read beyond substring
2965     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2966     // Back-up strings to avoid reading beyond substring:
2967     // cnt1 = cnt1 - cnt2 + 8
2968     addl(cnt1, cnt2); // cnt2 is negative
2969     addl(cnt1, stride);
2970     movl(cnt2, stride); negptr(cnt2);
2971     bind(CONT_SCAN_SUBSTR);
2972     if (int_cnt2 < (int)G) {
2973       int tail_off1 = int_cnt2<<scale1;
2974       int tail_off2 = int_cnt2<<scale2;
2975       if (ae == StrIntrinsicNode::UL) {
2976         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2977       } else {
2978         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2979       }
2980       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2981     } else {
2982       // calculate index in register to avoid integer overflow (int_cnt2*2)
2983       movl(tmp, int_cnt2);
2984       addptr(tmp, cnt2);
2985       if (ae == StrIntrinsicNode::UL) {
2986         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2987       } else {
2988         movdqu(vec, Address(str2, tmp, scale2, 0));
2989       }
2990       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2991     }
    // Need to reload the string pointers if the whole vector did not match
2993     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2994     addptr(cnt2, stride);
2995     jcc(Assembler::negative, SCAN_SUBSTR);
2996     // Fall through if found full substring
2997 
2998   } // (int_cnt2 > 8)
2999 
3000   bind(RET_FOUND);
3001   // Found result if we matched full small substring.
3002   // Compute substr offset
3003   subptr(result, str1);
3004   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3005     shrl(result, 1); // index
3006   }
3007   bind(EXIT);
3008 
3009 } // string_indexofC8
3010 
// Small strings are loaded through the stack if they cross a page boundary.
3012 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3013                                        Register cnt1, Register cnt2,
3014                                        int int_cnt2,  Register result,
3015                                        XMMRegister vec, Register tmp,
3016                                        int ae) {
3017   ShortBranchVerifier sbv(this);
3018   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3019   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3020 
3021   //
3022   // int_cnt2 is length of small (< 8 chars) constant substring
3023   // or (-1) for non constant substring in which case its length
3024   // is in cnt2 register.
3025   //
3026   // Note, inline_string_indexOf() generates checks:
3027   // if (substr.count > string.count) return -1;
3028   // if (substr.count == 0) return 0;
3029   //
3030   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3031   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3032   // This method uses the pcmpestri instruction with bound registers
3033   //   inputs:
3034   //     xmm - substring
3035   //     rax - substring length (elements count)
3036   //     mem - scanned string
3037   //     rdx - string length (elements count)
3038   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3039   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3040   //   outputs:
3041   //     rcx - matched index in string
3042   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3043   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3044   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3045   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3046 
3047   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3048         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3049         FOUND_CANDIDATE;
3050 
3051   { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
3054     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3055 
3056     movptr(tmp, rsp); // save old SP
3057 
3058     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3059       if (int_cnt2 == (1>>scale2)) { // One byte
3060         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3061         load_unsigned_byte(result, Address(str2, 0));
3062         movdl(vec, result); // move 32 bits
3063       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3064         // Not enough header space in 32-bit VM: 12+3 = 15.
3065         movl(result, Address(str2, -1));
3066         shrl(result, 8);
3067         movdl(vec, result); // move 32 bits
3068       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3069         load_unsigned_short(result, Address(str2, 0));
3070         movdl(vec, result); // move 32 bits
3071       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3072         movdl(vec, Address(str2, 0)); // move 32 bits
3073       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3074         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3076         // Array header size is 12 bytes in 32-bit VM
3077         // + 6 bytes for 3 chars == 18 bytes,
3078         // enough space to load vec and shift.
3079         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3080         if (ae == StrIntrinsicNode::UL) {
3081           int tail_off = int_cnt2-8;
3082           pmovzxbw(vec, Address(str2, tail_off));
3083           psrldq(vec, -2*tail_off);
3084         }
3085         else {
3086           int tail_off = int_cnt2*(1<<scale2);
3087           movdqu(vec, Address(str2, tail_off-16));
3088           psrldq(vec, 16-tail_off);
3089         }
3090       }
3091     } else { // not constant substring
3092       cmpl(cnt2, stride);
3093       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3094 
      // We can read beyond the string if str+16 does not cross a page boundary,
      // since heaps are aligned and mapped by pages.
3097       assert(os::vm_page_size() < (int)G, "default page should be small");
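      // Illustrative example (assuming 4K pages): if (str2 & 0xFFF) <= 0xFF0,
      // then str2 .. str2+15 lie within a single page, so the 16-byte vector
      // load cannot fault even though it may read past the end of the string,
      // and the copy to the stack below is skipped.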
3098       movl(result, str2); // We need only low 32 bits
3099       andl(result, ((int)os::vm_page_size()-1));
3100       cmpl(result, ((int)os::vm_page_size()-16));
3101       jccb(Assembler::belowEqual, CHECK_STR);
3102 
      // Move small strings to the stack to allow loading 16 bytes into vec.
3104       subptr(rsp, 16);
3105       int stk_offset = wordSize-(1<<scale2);
3106       push(cnt2);
3107 
3108       bind(COPY_SUBSTR);
3109       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3110         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3111         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3112       } else if (ae == StrIntrinsicNode::UU) {
3113         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3114         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3115       }
3116       decrement(cnt2);
3117       jccb(Assembler::notZero, COPY_SUBSTR);
3118 
3119       pop(cnt2);
3120       movptr(str2, rsp);  // New substring address
3121     } // non constant
3122 
3123     bind(CHECK_STR);
3124     cmpl(cnt1, stride);
3125     jccb(Assembler::aboveEqual, BIG_STRINGS);
3126 
3127     // Check cross page boundary.
3128     movl(result, str1); // We need only low 32 bits
3129     andl(result, ((int)os::vm_page_size()-1));
3130     cmpl(result, ((int)os::vm_page_size()-16));
3131     jccb(Assembler::belowEqual, BIG_STRINGS);
3132 
3133     subptr(rsp, 16);
3134     int stk_offset = -(1<<scale1);
3135     if (int_cnt2 < 0) { // not constant
3136       push(cnt2);
3137       stk_offset += wordSize;
3138     }
3139     movl(cnt2, cnt1);
3140 
3141     bind(COPY_STR);
3142     if (ae == StrIntrinsicNode::LL) {
3143       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3144       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3145     } else {
3146       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3147       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3148     }
3149     decrement(cnt2);
3150     jccb(Assembler::notZero, COPY_STR);
3151 
3152     if (int_cnt2 < 0) { // not constant
3153       pop(cnt2);
3154     }
3155     movptr(str1, rsp);  // New string address
3156 
3157     bind(BIG_STRINGS);
3158     // Load substring.
3159     if (int_cnt2 < 0) { // -1
3160       if (ae == StrIntrinsicNode::UL) {
3161         pmovzxbw(vec, Address(str2, 0));
3162       } else {
3163         movdqu(vec, Address(str2, 0));
3164       }
3165       push(cnt2);       // substr count
3166       push(str2);       // substr addr
3167       push(str1);       // string addr
3168     } else {
3169       // Small (< 8 chars) constant substrings are loaded already.
3170       movl(cnt2, int_cnt2);
3171     }
3172     push(tmp);  // original SP
3173 
3174   } // Finished loading
3175 
3176   //========================================================
3177   // Start search
3178   //
3179 
3180   movptr(result, str1); // string addr
3181 
3182   if (int_cnt2  < 0) {  // Only for non constant substring
3183     jmpb(SCAN_TO_SUBSTR);
3184 
3185     // SP saved at sp+0
3186     // String saved at sp+1*wordSize
3187     // Substr saved at sp+2*wordSize
3188     // Substr count saved at sp+3*wordSize
3189 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
3192     bind(RELOAD_SUBSTR);
3193     movptr(str2, Address(rsp, 2*wordSize));
3194     movl(cnt2, Address(rsp, 3*wordSize));
3195     if (ae == StrIntrinsicNode::UL) {
3196       pmovzxbw(vec, Address(str2, 0));
3197     } else {
3198       movdqu(vec, Address(str2, 0));
3199     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
3203     subptr(str1, result); // Restore counter
3204     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3205       shrl(str1, 1);
3206     }
3207     addl(cnt1, str1);
3208     decrementl(cnt1);   // Shift to next element
3209     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3211 
3212     addptr(result, (1<<scale1));
3213   } // non constant
3214 
3215   // Scan string for start of substr in 16-byte vectors
3216   bind(SCAN_TO_SUBSTR);
3217   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3218   pcmpestri(vec, Address(result, 0), mode);
3219   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3220   subl(cnt1, stride);
3221   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3222   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3224   addptr(result, 16);
3225 
3226   bind(ADJUST_STR);
3227   cmpl(cnt1, stride); // Do not read beyond string
3228   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3229   // Back-up string to avoid reading beyond string.
3230   lea(result, Address(result, cnt1, scale1, -16));
3231   movl(cnt1, stride);
3232   jmpb(SCAN_TO_SUBSTR);
3233 
3234   // Found a potential substr
3235   bind(FOUND_CANDIDATE);
3236   // After pcmpestri tmp(rcx) contains matched element index
3237 
3238   // Make sure string is still long enough
3239   subl(cnt1, tmp);
3240   cmpl(cnt1, cnt2);
3241   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3243 
3244   bind(RET_NOT_FOUND);
3245   movl(result, -1);
3246   jmp(CLEANUP);
3247 
3248   bind(FOUND_SUBSTR);
3249   // Compute start addr of substr
3250   lea(result, Address(result, tmp, scale1));
3251   if (int_cnt2 > 0) { // Constant substring
3252     // Repeat search for small substring (< 8 chars)
3253     // from new point without reloading substring.
3254     // Have to check that we don't read beyond string.
3255     cmpl(tmp, stride-int_cnt2);
3256     jccb(Assembler::greater, ADJUST_STR);
3257     // Fall through if matched whole substring.
3258   } else { // non constant
3259     assert(int_cnt2 == -1, "should be != 0");
3260 
3261     addl(tmp, cnt2);
3262     // Found result if we matched whole substring.
3263     cmpl(tmp, stride);
3264     jcc(Assembler::lessEqual, RET_FOUND);
3265 
3266     // Repeat search for small substring (<= 8 chars)
3267     // from new point 'str1' without reloading substring.
3268     cmpl(cnt2, stride);
3269     // Have to check that we don't read beyond string.
3270     jccb(Assembler::lessEqual, ADJUST_STR);
3271 
3272     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3273     // Compare the rest of substring (> 8 chars).
3274     movptr(str1, result);
3275 
3276     cmpl(tmp, cnt2);
3277     // First 8 chars are already matched.
3278     jccb(Assembler::equal, CHECK_NEXT);
3279 
3280     bind(SCAN_SUBSTR);
3281     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload the string pointers if the whole vector did not match
3283     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3284 
3285     bind(CHECK_NEXT);
3286     subl(cnt2, stride);
3287     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3288     addptr(str1, 16);
3289     if (ae == StrIntrinsicNode::UL) {
3290       addptr(str2, 8);
3291     } else {
3292       addptr(str2, 16);
3293     }
3294     subl(cnt1, stride);
3295     cmpl(cnt2, stride); // Do not read beyond substring
3296     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3297     // Back-up strings to avoid reading beyond substring.
3298 
3299     if (ae == StrIntrinsicNode::UL) {
3300       lea(str2, Address(str2, cnt2, scale2, -8));
3301       lea(str1, Address(str1, cnt2, scale1, -16));
3302     } else {
3303       lea(str2, Address(str2, cnt2, scale2, -16));
3304       lea(str1, Address(str1, cnt2, scale1, -16));
3305     }
3306     subl(cnt1, cnt2);
3307     movl(cnt2, stride);
3308     addl(cnt1, stride);
3309     bind(CONT_SCAN_SUBSTR);
3310     if (ae == StrIntrinsicNode::UL) {
3311       pmovzxbw(vec, Address(str2, 0));
3312     } else {
3313       movdqu(vec, Address(str2, 0));
3314     }
3315     jmp(SCAN_SUBSTR);
3316 
3317     bind(RET_FOUND_LONG);
3318     movptr(str1, Address(rsp, wordSize));
3319   } // non constant
3320 
3321   bind(RET_FOUND);
3322   // Compute substr offset
3323   subptr(result, str1);
3324   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3325     shrl(result, 1); // index
3326   }
3327   bind(CLEANUP);
3328   pop(rsp); // restore SP
3329 
3330 } // string_indexof
3331 
3332 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3333                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3334   ShortBranchVerifier sbv(this);
3335   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3336 
3337   int stride = 8;
3338 
3339   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3340         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3341         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3342         FOUND_SEQ_CHAR, DONE_LABEL;
3343 
3344   movptr(result, str1);
3345   if (UseAVX >= 2) {
3346     cmpl(cnt1, stride);
3347     jcc(Assembler::less, SCAN_TO_CHAR);
3348     cmpl(cnt1, 2*stride);
3349     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3350     movdl(vec1, ch);
3351     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3352     vpxor(vec2, vec2);
3353     movl(tmp, cnt1);
3354     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3355     andl(cnt1,0x0000000F);  //tail count (in chars)
3356 
3357     bind(SCAN_TO_16_CHAR_LOOP);
3358     vmovdqu(vec3, Address(result, 0));
3359     vpcmpeqw(vec3, vec3, vec1, 1);
3360     vptest(vec2, vec3);
3361     jcc(Assembler::carryClear, FOUND_CHAR);
3362     addptr(result, 32);
3363     subl(tmp, 2*stride);
3364     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3365     jmp(SCAN_TO_8_CHAR);
3366     bind(SCAN_TO_8_CHAR_INIT);
3367     movdl(vec1, ch);
3368     pshuflw(vec1, vec1, 0x00);
3369     pshufd(vec1, vec1, 0);
3370     pxor(vec2, vec2);
3371   }
3372   bind(SCAN_TO_8_CHAR);
3373   cmpl(cnt1, stride);
3374   jcc(Assembler::less, SCAN_TO_CHAR);
3375   if (UseAVX < 2) {
3376     movdl(vec1, ch);
3377     pshuflw(vec1, vec1, 0x00);
3378     pshufd(vec1, vec1, 0);
3379     pxor(vec2, vec2);
3380   }
3381   movl(tmp, cnt1);
3382   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3383   andl(cnt1,0x00000007);  //tail count (in chars)
3384 
3385   bind(SCAN_TO_8_CHAR_LOOP);
3386   movdqu(vec3, Address(result, 0));
3387   pcmpeqw(vec3, vec1);
3388   ptest(vec2, vec3);
3389   jcc(Assembler::carryClear, FOUND_CHAR);
3390   addptr(result, 16);
3391   subl(tmp, stride);
3392   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3393   bind(SCAN_TO_CHAR);
3394   testl(cnt1, cnt1);
3395   jcc(Assembler::zero, RET_NOT_FOUND);
3396   bind(SCAN_TO_CHAR_LOOP);
3397   load_unsigned_short(tmp, Address(result, 0));
3398   cmpl(ch, tmp);
3399   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3400   addptr(result, 2);
3401   subl(cnt1, 1);
3402   jccb(Assembler::zero, RET_NOT_FOUND);
3403   jmp(SCAN_TO_CHAR_LOOP);
3404 
3405   bind(RET_NOT_FOUND);
3406   movl(result, -1);
3407   jmpb(DONE_LABEL);
3408 
3409   bind(FOUND_CHAR);
3410   if (UseAVX >= 2) {
3411     vpmovmskb(tmp, vec3);
3412   } else {
3413     pmovmskb(tmp, vec3);
3414   }
3415   bsfl(ch, tmp);
3416   addptr(result, ch);
3417 
3418   bind(FOUND_SEQ_CHAR);
3419   subptr(result, str1);
3420   shrl(result, 1);
3421 
3422   bind(DONE_LABEL);
3423 } // string_indexof_char
3424 
3425 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3426                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3427   ShortBranchVerifier sbv(this);
3428   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3429 
3430   int stride = 16;
3431 
3432   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3433         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3434         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3435         FOUND_SEQ_CHAR, DONE_LABEL;
3436 
3437   movptr(result, str1);
3438   if (UseAVX >= 2) {
3439     cmpl(cnt1, stride);
3440     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3441     cmpl(cnt1, stride*2);
3442     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3443     movdl(vec1, ch);
3444     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3445     vpxor(vec2, vec2);
3446     movl(tmp, cnt1);
3447     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3448     andl(cnt1,0x0000001F);  //tail count (in chars)
3449 
3450     bind(SCAN_TO_32_CHAR_LOOP);
3451     vmovdqu(vec3, Address(result, 0));
3452     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3453     vptest(vec2, vec3);
3454     jcc(Assembler::carryClear, FOUND_CHAR);
3455     addptr(result, 32);
3456     subl(tmp, stride*2);
3457     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3458     jmp(SCAN_TO_16_CHAR);
3459 
3460     bind(SCAN_TO_16_CHAR_INIT);
3461     movdl(vec1, ch);
3462     pxor(vec2, vec2);
3463     pshufb(vec1, vec2);
3464   }
3465 
3466   bind(SCAN_TO_16_CHAR);
3467   cmpl(cnt1, stride);
3468   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3469   if (UseAVX < 2) {
3470     movdl(vec1, ch);
3471     pxor(vec2, vec2);
3472     pshufb(vec1, vec2);
3473   }
3474   movl(tmp, cnt1);
3475   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3476   andl(cnt1,0x0000000F);  //tail count (in bytes)
3477 
3478   bind(SCAN_TO_16_CHAR_LOOP);
3479   movdqu(vec3, Address(result, 0));
3480   pcmpeqb(vec3, vec1);
3481   ptest(vec2, vec3);
3482   jcc(Assembler::carryClear, FOUND_CHAR);
3483   addptr(result, 16);
3484   subl(tmp, stride);
3485   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3486 
3487   bind(SCAN_TO_CHAR_INIT);
3488   testl(cnt1, cnt1);
3489   jcc(Assembler::zero, RET_NOT_FOUND);
3490   bind(SCAN_TO_CHAR_LOOP);
3491   load_unsigned_byte(tmp, Address(result, 0));
3492   cmpl(ch, tmp);
3493   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3494   addptr(result, 1);
3495   subl(cnt1, 1);
3496   jccb(Assembler::zero, RET_NOT_FOUND);
3497   jmp(SCAN_TO_CHAR_LOOP);
3498 
3499   bind(RET_NOT_FOUND);
3500   movl(result, -1);
3501   jmpb(DONE_LABEL);
3502 
3503   bind(FOUND_CHAR);
3504   if (UseAVX >= 2) {
3505     vpmovmskb(tmp, vec3);
3506   } else {
3507     pmovmskb(tmp, vec3);
3508   }
3509   bsfl(ch, tmp);
3510   addptr(result, ch);
3511 
3512   bind(FOUND_SEQ_CHAR);
3513   subptr(result, str1);
3514 
3515   bind(DONE_LABEL);
3516 } // stringL_indexof_char
3517 
3518 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3519   switch (eltype) {
3520   case T_BOOLEAN: return sizeof(jboolean);
3521   case T_BYTE:  return sizeof(jbyte);
3522   case T_SHORT: return sizeof(jshort);
3523   case T_CHAR:  return sizeof(jchar);
3524   case T_INT:   return sizeof(jint);
3525   default:
3526     ShouldNotReachHere();
3527     return -1;
3528   }
3529 }
3530 
3531 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3532   switch (eltype) {
3533   // T_BOOLEAN used as surrogate for unsigned byte
3534   case T_BOOLEAN: movzbl(dst, src);   break;
3535   case T_BYTE:    movsbl(dst, src);   break;
3536   case T_SHORT:   movswl(dst, src);   break;
3537   case T_CHAR:    movzwl(dst, src);   break;
3538   case T_INT:     movl(dst, src);     break;
3539   default:
3540     ShouldNotReachHere();
3541   }
3542 }
3543 
3544 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3545   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3546 }
3547 
3548 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3549   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3550 }
3551 
3552 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3553   const int vlen = Assembler::AVX_256bit;
3554   switch (eltype) {
3555   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3556   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3557   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3558   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3559   case T_INT:
3560     // do nothing
3561     break;
3562   default:
3563     ShouldNotReachHere();
3564   }
3565 }
3566 
3567 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3568                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3569                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3570                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3571                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3572                                         BasicType eltype) {
3573   ShortBranchVerifier sbv(this);
3574   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3575   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3576   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3577 
3578   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3579         SHORT_UNROLLED_LOOP_EXIT,
3580         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3581         UNROLLED_VECTOR_LOOP_BEGIN,
3582         END;
3583   switch (eltype) {
3584   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3585   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3586   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3587   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3588   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3589   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3590   }
3591 
3592   // For "renaming" for readibility of the code
3593   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3594                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3595                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3596 
3597   const int elsize = arrays_hashcode_elsize(eltype);
3598 
3599   /*
3600     if (cnt1 >= 2) {
3601       if (cnt1 >= 32) {
3602         UNROLLED VECTOR LOOP
3603       }
3604       UNROLLED SCALAR LOOP
3605     }
3606     SINGLE SCALAR
3607    */
3608 
3609   cmpl(cnt1, 32);
3610   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3611 
3612   // cnt1 >= 32 && generate_vectorized_loop
3613   xorl(index, index);
3614 
3615   // vresult = IntVector.zero(I256);
3616   for (int idx = 0; idx < 4; idx++) {
3617     vpxor(vresult[idx], vresult[idx]);
3618   }
3619   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3620   Register bound = tmp2;
3621   Register next = tmp3;
3622   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3623   movl(next, Address(tmp2, 0));
3624   movdl(vnext, next);
3625   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3626 
3627   // index = 0;
3628   // bound = cnt1 & ~(32 - 1);
3629   movl(bound, cnt1);
3630   andl(bound, ~(32 - 1));
3631   // for (; index < bound; index += 32) {
3632   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3633   // result *= next;
3634   imull(result, next);
  // loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching
3637   for (int idx = 0; idx < 4; idx++) {
3638     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3639   }
3640   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3641   for (int idx = 0; idx < 4; idx++) {
3642     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3643     arrays_hashcode_elvcast(vtmp[idx], eltype);
3644     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3645   }
3646   // index += 32;
3647   addl(index, 32);
3648   // index < bound;
3649   cmpl(index, bound);
3650   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3651   // }
3652 
3653   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3654   subl(cnt1, bound);
3655   // release bound
3656 
3657   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3658   for (int idx = 0; idx < 4; idx++) {
3659     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3660     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3661     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3662   }
3663   // result += vresult.reduceLanes(ADD);
3664   for (int idx = 0; idx < 4; idx++) {
3665     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3666   }
3667 
3668   // } else if (cnt1 < 32) {
3669 
3670   bind(SHORT_UNROLLED_BEGIN);
3671   // int i = 1;
3672   movl(index, 1);
3673   cmpl(index, cnt1);
3674   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3675 
3676   // for (; i < cnt1 ; i += 2) {
3677   bind(SHORT_UNROLLED_LOOP_BEGIN);
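  // Each iteration folds two elements into the running hash (illustration of
  // the code below, not additional emitted instructions):
  //   result = 31*31*result + 31*a[i-1] + a[i]
  // with 961 == 31*31 and 31*x computed as (x << 5) - x.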
3678   movl(tmp3, 961);
3679   imull(result, tmp3);
3680   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3681   movl(tmp3, tmp2);
3682   shll(tmp3, 5);
3683   subl(tmp3, tmp2);
3684   addl(result, tmp3);
3685   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3686   addl(result, tmp3);
3687   addl(index, 2);
3688   cmpl(index, cnt1);
3689   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3690 
3691   // }
3692   // if (i >= cnt1) {
3693   bind(SHORT_UNROLLED_LOOP_EXIT);
3694   jccb(Assembler::greater, END);
3695   movl(tmp2, result);
3696   shll(result, 5);
3697   subl(result, tmp2);
3698   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3699   addl(result, tmp3);
3700   // }
3701   bind(END);
3702 
3703   BLOCK_COMMENT("} // arrays_hashcode");
3704 
3705 } // arrays_hashcode
3706 
3707 // helper function for string_compare
3708 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3709                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3710                                            Address::ScaleFactor scale2, Register index, int ae) {
3711   if (ae == StrIntrinsicNode::LL) {
3712     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3713     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3714   } else if (ae == StrIntrinsicNode::UU) {
3715     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3716     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3717   } else {
3718     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3719     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3720   }
3721 }
3722 
3723 // Compare strings, used for char[] and byte[].
3724 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3725                                        Register cnt1, Register cnt2, Register result,
3726                                        XMMRegister vec1, int ae, KRegister mask) {
3727   ShortBranchVerifier sbv(this);
3728   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3729   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3730   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3731   int stride2x2 = 0x40;
3732   Address::ScaleFactor scale = Address::no_scale;
3733   Address::ScaleFactor scale1 = Address::no_scale;
3734   Address::ScaleFactor scale2 = Address::no_scale;
3735 
3736   if (ae != StrIntrinsicNode::LL) {
3737     stride2x2 = 0x20;
3738   }
3739 
3740   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3741     shrl(cnt2, 1);
3742   }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (pushed on the stack),
  // using a conditional move for the minimum.
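  // Sketch of the register bookkeeping below (illustration, not emitted code):
  //   result = cnt1;                          // original cnt1
  //   diff   = cnt1 - cnt2;                   // kept in cnt1, pushed on the stack
  //   cnt2   = (diff <= 0) ? result : cnt2;   // cnt2 = min(cnt1, cnt2)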
3746   movl(result, cnt1);
3747   subl(cnt1, cnt2);
3748   push(cnt1);
3749   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3750 
3751   // Is the minimum length zero?
3752   testl(cnt2, cnt2);
3753   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3754   if (ae == StrIntrinsicNode::LL) {
3755     // Load first bytes
3756     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3757     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3758   } else if (ae == StrIntrinsicNode::UU) {
3759     // Load first characters
3760     load_unsigned_short(result, Address(str1, 0));
3761     load_unsigned_short(cnt1, Address(str2, 0));
3762   } else {
3763     load_unsigned_byte(result, Address(str1, 0));
3764     load_unsigned_short(cnt1, Address(str2, 0));
3765   }
3766   subl(result, cnt1);
3767   jcc(Assembler::notZero,  POP_LABEL);
3768 
3769   if (ae == StrIntrinsicNode::UU) {
3770     // Divide length by 2 to get number of chars
3771     shrl(cnt2, 1);
3772   }
3773   cmpl(cnt2, 1);
3774   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3775 
3776   // Check if the strings start at the same location and setup scale and stride
3777   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3778     cmpptr(str1, str2);
3779     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3780     if (ae == StrIntrinsicNode::LL) {
3781       scale = Address::times_1;
3782       stride = 16;
3783     } else {
3784       scale = Address::times_2;
3785       stride = 8;
3786     }
3787   } else {
3788     scale1 = Address::times_1;
3789     scale2 = Address::times_2;
3790     // scale not used
3791     stride = 8;
3792   }
3793 
3794   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3795     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3796     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3797     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3798     Label COMPARE_TAIL_LONG;
3799     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3800 
3801     int pcmpmask = 0x19;
3802     if (ae == StrIntrinsicNode::LL) {
3803       pcmpmask &= ~0x01;
3804     }
3805 
3806     // Setup to compare 16-chars (32-bytes) vectors,
3807     // start from first character again because it has aligned address.
3808     if (ae == StrIntrinsicNode::LL) {
3809       stride2 = 32;
3810     } else {
3811       stride2 = 16;
3812     }
3813     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3814       adr_stride = stride << scale;
3815     } else {
3816       adr_stride1 = 8;  //stride << scale1;
3817       adr_stride2 = 16; //stride << scale2;
3818     }
3819 
3820     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3821     // rax and rdx are used by pcmpestri as elements counters
3822     movl(result, cnt2);
3823     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3824     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3825 
3826     // fast path : compare first 2 8-char vectors.
3827     bind(COMPARE_16_CHARS);
3828     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3829       movdqu(vec1, Address(str1, 0));
3830     } else {
3831       pmovzxbw(vec1, Address(str1, 0));
3832     }
3833     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3834     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3835 
3836     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3837       movdqu(vec1, Address(str1, adr_stride));
3838       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3839     } else {
3840       pmovzxbw(vec1, Address(str1, adr_stride1));
3841       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3842     }
3843     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3844     addl(cnt1, stride);
3845 
3846     // Compare the characters at index in cnt1
3847     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3848     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3849     subl(result, cnt2);
3850     jmp(POP_LABEL);
3851 
3852     // Setup the registers to start vector comparison loop
3853     bind(COMPARE_WIDE_VECTORS);
3854     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3855       lea(str1, Address(str1, result, scale));
3856       lea(str2, Address(str2, result, scale));
3857     } else {
3858       lea(str1, Address(str1, result, scale1));
3859       lea(str2, Address(str2, result, scale2));
3860     }
3861     subl(result, stride2);
3862     subl(cnt2, stride2);
3863     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3864     negptr(result);
3865 
3866     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
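    // Per-iteration sketch of the AVX2 loop (illustration, not emitted code):
    // load 32 bytes of str1 (zero-extending the byte string to chars for
    // mixed encodings), XOR them with the corresponding bytes of str2, and
    // vptest the result; a non-zero value means a mismatch in this chunk.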
3867     bind(COMPARE_WIDE_VECTORS_LOOP);
3868 
3869 #ifdef _LP64
3870     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3871       cmpl(cnt2, stride2x2);
3872       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3873       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3874       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3875 
3876       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3877       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3878         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3879         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3880       } else {
3881         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3882         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3883       }
3884       kortestql(mask, mask);
3885       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3886       addptr(result, stride2x2);  // update since we already compared at this addr
3887       subl(cnt2, stride2x2);      // and sub the size too
3888       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3889 
3890       vpxor(vec1, vec1);
3891       jmpb(COMPARE_WIDE_TAIL);
3892     }//if (VM_Version::supports_avx512vlbw())
3893 #endif // _LP64
3894 
3895 
3896     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3897     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3898       vmovdqu(vec1, Address(str1, result, scale));
3899       vpxor(vec1, Address(str2, result, scale));
3900     } else {
3901       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3902       vpxor(vec1, Address(str2, result, scale2));
3903     }
3904     vptest(vec1, vec1);
3905     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3906     addptr(result, stride2);
3907     subl(cnt2, stride2);
3908     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3909     // clean upper bits of YMM registers
3910     vpxor(vec1, vec1);
3911 
3912     // compare wide vectors tail
3913     bind(COMPARE_WIDE_TAIL);
3914     testptr(result, result);
3915     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3916 
3917     movl(result, stride2);
3918     movl(cnt2, result);
3919     negptr(result);
3920     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3921 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3923     bind(VECTOR_NOT_EQUAL);
3924     // clean upper bits of YMM registers
3925     vpxor(vec1, vec1);
3926     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3927       lea(str1, Address(str1, result, scale));
3928       lea(str2, Address(str2, result, scale));
3929     } else {
3930       lea(str1, Address(str1, result, scale1));
3931       lea(str2, Address(str2, result, scale2));
3932     }
3933     jmp(COMPARE_16_CHARS);
3934 
3935     // Compare tail chars, length between 1 to 15 chars
3936     bind(COMPARE_TAIL_LONG);
3937     movl(cnt2, result);
3938     cmpl(cnt2, stride);
3939     jcc(Assembler::less, COMPARE_SMALL_STR);
3940 
3941     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3942       movdqu(vec1, Address(str1, 0));
3943     } else {
3944       pmovzxbw(vec1, Address(str1, 0));
3945     }
3946     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3947     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3948     subptr(cnt2, stride);
3949     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3950     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3951       lea(str1, Address(str1, result, scale));
3952       lea(str2, Address(str2, result, scale));
3953     } else {
3954       lea(str1, Address(str1, result, scale1));
3955       lea(str2, Address(str2, result, scale2));
3956     }
3957     negptr(cnt2);
3958     jmpb(WHILE_HEAD_LABEL);
3959 
3960     bind(COMPARE_SMALL_STR);
3961   } else if (UseSSE42Intrinsics) {
3962     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3963     int pcmpmask = 0x19;
3964     // Setup to compare 8-char (16-byte) vectors,
3965     // start from first character again because it has aligned address.
3966     movl(result, cnt2);
3967     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3968     if (ae == StrIntrinsicNode::LL) {
3969       pcmpmask &= ~0x01;
3970     }
3971     jcc(Assembler::zero, COMPARE_TAIL);
3972     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3973       lea(str1, Address(str1, result, scale));
3974       lea(str2, Address(str2, result, scale));
3975     } else {
3976       lea(str1, Address(str1, result, scale1));
3977       lea(str2, Address(str2, result, scale2));
3978     }
3979     negptr(result);
3980 
3981     // pcmpestri
3982     //   inputs:
3983     //     vec1- substring
3984     //     rax - negative string length (elements count)
3985     //     mem - scanned string
3986     //     rdx - string length (elements count)
3987     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3988     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3989     //   outputs:
3990     //     rcx - first mismatched element index
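    //
    // Worked breakdown of pcmpmask (per the SSE4.2 PCMPESTRI immediate
    // encoding; illustration only):
    //   0x19 = 0b0011001: bits[1:0] = 01 -> unsigned words
    //                     bits[3:2] = 10 -> "equal each" (string compare)
    //                     bits[5:4] = 01 -> negative polarity (negated result)
    //                     bit 6     = 0  -> least significant mismatch index
    //   clearing bit 0 (-> 0x18) switches the element type to unsigned bytes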
3991     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3992 
3993     bind(COMPARE_WIDE_VECTORS);
3994     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3995       movdqu(vec1, Address(str1, result, scale));
3996       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3997     } else {
3998       pmovzxbw(vec1, Address(str1, result, scale1));
3999       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4000     }
4001     // After pcmpestri cnt1(rcx) contains mismatched element index
4002 
4003     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4004     addptr(result, stride);
4005     subptr(cnt2, stride);
4006     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4007 
4008     // compare wide vectors tail
4009     testptr(result, result);
4010     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4011 
4012     movl(cnt2, stride);
4013     movl(result, stride);
4014     negptr(result);
4015     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4016       movdqu(vec1, Address(str1, result, scale));
4017       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4018     } else {
4019       pmovzxbw(vec1, Address(str1, result, scale1));
4020       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4021     }
4022     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4023 
4024     // Mismatched characters in the vectors
4025     bind(VECTOR_NOT_EQUAL);
4026     addptr(cnt1, result);
4027     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4028     subl(result, cnt2);
4029     jmpb(POP_LABEL);
4030 
4031     bind(COMPARE_TAIL); // limit is zero
4032     movl(cnt2, result);
4033     // Fallthru to tail compare
4034   }
4035   // Shift str2 and str1 to the end of the arrays, negate min
4036   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4037     lea(str1, Address(str1, cnt2, scale));
4038     lea(str2, Address(str2, cnt2, scale));
4039   } else {
4040     lea(str1, Address(str1, cnt2, scale1));
4041     lea(str2, Address(str2, cnt2, scale2));
4042   }
4043   decrementl(cnt2);  // first character was compared already
4044   negptr(cnt2);
4045 
4046   // Compare the rest of the elements
4047   bind(WHILE_HEAD_LABEL);
4048   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4049   subl(result, cnt1);
4050   jccb(Assembler::notZero, POP_LABEL);
4051   increment(cnt2);
4052   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4053 
4054   // Strings are equal up to min length.  Return the length difference.
4055   bind(LENGTH_DIFF_LABEL);
4056   pop(result);
4057   if (ae == StrIntrinsicNode::UU) {
4058     // Divide diff by 2 to get number of chars
4059     sarl(result, 1);
4060   }
4061   jmpb(DONE_LABEL);
4062 
4063 #ifdef _LP64
4064   if (VM_Version::supports_avx512vlbw()) {
4065 
4066     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4067 
4068     kmovql(cnt1, mask);
4069     notq(cnt1);
4070     bsfq(cnt2, cnt1);
4071     if (ae != StrIntrinsicNode::LL) {
4072       // Divide diff by 2 to get number of chars
4073       sarl(cnt2, 1);
4074     }
4075     addq(result, cnt2);
4076     if (ae == StrIntrinsicNode::LL) {
4077       load_unsigned_byte(cnt1, Address(str2, result));
4078       load_unsigned_byte(result, Address(str1, result));
4079     } else if (ae == StrIntrinsicNode::UU) {
4080       load_unsigned_short(cnt1, Address(str2, result, scale));
4081       load_unsigned_short(result, Address(str1, result, scale));
4082     } else {
4083       load_unsigned_short(cnt1, Address(str2, result, scale2));
4084       load_unsigned_byte(result, Address(str1, result, scale1));
4085     }
4086     subl(result, cnt1);
4087     jmpb(POP_LABEL);
4088   }//if (VM_Version::supports_avx512vlbw())
4089 #endif // _LP64
4090 
4091   // Discard the stored length difference
4092   bind(POP_LABEL);
4093   pop(cnt1);
4094 
4095   // That's it
4096   bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
4098     negl(result);
4099   }
4100 
4101 }
4102 
4103 // Search for Non-ASCII character (Negative byte value) in a byte array,
4104 // return the index of the first such character, otherwise the length
4105 // of the array segment searched.
4106 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4107 //   @IntrinsicCandidate
4108 //   public static int countPositives(byte[] ba, int off, int len) {
4109 //     for (int i = off; i < off + len; i++) {
4110 //       if (ba[i] < 0) {
4111 //         return i - off;
4112 //       }
4113 //     }
4114 //     return len;
4115 //   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4119   // rsi: byte array
4120   // rcx: len
4121   // rax: result
4122   ShortBranchVerifier sbv(this);
4123   assert_different_registers(ary1, len, result, tmp1);
4124   assert_different_registers(vec1, vec2);
4125   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4126 
4127   movl(result, len); // copy
4128   // len == 0
4129   testl(len, len);
4130   jcc(Assembler::zero, DONE);
4131 
  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {
4135 
4136     Label test_64_loop, test_tail, BREAK_LOOP;
4137     movl(tmp1, len);
4138     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4139 
4140     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4141     andl(len,  0xffffffc0); // vector count (in chars)
4142     jccb(Assembler::zero, test_tail);
4143 
4144     lea(ary1, Address(ary1, len, Address::times_1));
4145     negptr(len);
4146 
4147     bind(test_64_loop);
    // Check whether our 64 byte-sized elements contain negatives
4149     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4150     kortestql(mask1, mask1);
4151     jcc(Assembler::notZero, BREAK_LOOP);
4152 
4153     addptr(len, 64);
4154     jccb(Assembler::notZero, test_64_loop);
4155 
4156     bind(test_tail);
4157     // bail out when there is nothing to be done
4158     testl(tmp1, -1);
4159     jcc(Assembler::zero, DONE);
4160 
4161 
    // check the tail for absence of negatives
4163     // ~(~0 << len) applied up to two times (for 32-bit scenario)
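    // Illustrative sketch (not emitted code): for a tail count tmp1 == 5 the
    // 64-bit path below computes ~(~0 << 5) == 0x1F, i.e. a k-mask with only
    // the low five bits set, so only the five tail bytes take part in the compare.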
4164 #ifdef _LP64
4165     {
4166       Register tmp3_aliased = len;
4167       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4168       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4169       notq(tmp3_aliased);
4170       kmovql(mask2, tmp3_aliased);
4171     }
4172 #else
4173     Label k_init;
4174     jmp(k_init);
4175 
    // We cannot load 64 bits at once from a general purpose register, so the
    // data needed to compose the mask is placed in the instruction stream.
    // We emit a 64-byte series of elements 0..63 which is later used as the
    // compare target against the tail count broadcast from the tmp1 register.
    // The result is a k register with tmp1 consecutive 1s, counting from the
    // least significant bit.
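    // For example (sketch only): with tmp1 == 5 the broadcast/compare at k_init
    // below produces mask2 lanes (5 > i) for i = 0..63, i.e. bits 0..4 set,
    // matching the shlx/not construction used on the 64-bit path above.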
4182     address tmp = pc();
4183     emit_int64(0x0706050403020100);
4184     emit_int64(0x0F0E0D0C0B0A0908);
4185     emit_int64(0x1716151413121110);
4186     emit_int64(0x1F1E1D1C1B1A1918);
4187     emit_int64(0x2726252423222120);
4188     emit_int64(0x2F2E2D2C2B2A2928);
4189     emit_int64(0x3736353433323130);
4190     emit_int64(0x3F3E3D3C3B3A3938);
4191 
4192     bind(k_init);
4193     lea(len, InternalAddress(tmp));
4194     // create mask to test for negative byte inside a vector
4195     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4196     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4197 
4198 #endif
4199     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4200     ktestq(mask1, mask2);
4201     jcc(Assembler::zero, DONE);
4202 
    // do a full check for negative bytes in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4205                      // ary1 already pointing to the right place
4206     jmpb(TAIL_START);
4207 
4208     bind(BREAK_LOOP);
4209     // At least one byte in the last 64 byte block was negative.
4210     // Set up to look at the last 64 bytes as if they were a tail
4211     lea(ary1, Address(ary1, len, Address::times_1));
4212     addptr(result, len);
4213     // Ignore the very last byte: if all others are positive,
4214     // it must be negative, so we can skip right to the 2+1 byte
4215     // end comparison at this point
4216     orl(result, 63);
4217     movl(len, 63);
4218     // Fallthru to tail compare
4219   } else {
4220 
4221     if (UseAVX >= 2 && UseSSE >= 2) {
4222       // With AVX2, use 32-byte vector compare
4223       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4224 
4225       // Compare 32-byte vectors
4226       testl(len, 0xffffffe0);   // vector count (in bytes)
4227       jccb(Assembler::zero, TAIL_START);
4228 
4229       andl(len, 0xffffffe0);
4230       lea(ary1, Address(ary1, len, Address::times_1));
4231       negptr(len);
4232 
      movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4234       movdl(vec2, tmp1);
4235       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4236 
4237       bind(COMPARE_WIDE_VECTORS);
4238       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4239       vptest(vec1, vec2);
4240       jccb(Assembler::notZero, BREAK_LOOP);
4241       addptr(len, 32);
4242       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4243 
4244       testl(result, 0x0000001f);   // any bytes remaining?
4245       jcc(Assembler::zero, DONE);
4246 
4247       // Quick test using the already prepared vector mask
4248       movl(len, result);
4249       andl(len, 0x0000001f);
4250       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4251       vptest(vec1, vec2);
4252       jcc(Assembler::zero, DONE);
      // There are negative bytes, jump to the tail to determine exactly where
4254       jmpb(TAIL_START);
4255 
4256       bind(BREAK_LOOP);
4257       // At least one byte in the last 32-byte vector is negative.
4258       // Set up to look at the last 32 bytes as if they were a tail
4259       lea(ary1, Address(ary1, len, Address::times_1));
4260       addptr(result, len);
4261       // Ignore the very last byte: if all others are positive,
4262       // it must be negative, so we can skip right to the 2+1 byte
4263       // end comparison at this point
4264       orl(result, 31);
4265       movl(len, 31);
4266       // Fallthru to tail compare
4267     } else if (UseSSE42Intrinsics) {
4268       // With SSE4.2, use double quad vector compare
4269       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4270 
4271       // Compare 16-byte vectors
4272       testl(len, 0xfffffff0);   // vector count (in bytes)
4273       jcc(Assembler::zero, TAIL_START);
4274 
4275       andl(len, 0xfffffff0);
4276       lea(ary1, Address(ary1, len, Address::times_1));
4277       negptr(len);
4278 
4279       movl(tmp1, 0x80808080);
4280       movdl(vec2, tmp1);
4281       pshufd(vec2, vec2, 0);
4282 
4283       bind(COMPARE_WIDE_VECTORS);
4284       movdqu(vec1, Address(ary1, len, Address::times_1));
4285       ptest(vec1, vec2);
4286       jccb(Assembler::notZero, BREAK_LOOP);
4287       addptr(len, 16);
4288       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4289 
4290       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4291       jcc(Assembler::zero, DONE);
4292 
4293       // Quick test using the already prepared vector mask
4294       movl(len, result);
4295       andl(len, 0x0000000f);   // tail count (in bytes)
4296       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4297       ptest(vec1, vec2);
4298       jcc(Assembler::zero, DONE);
4299       jmpb(TAIL_START);
4300 
4301       bind(BREAK_LOOP);
4302       // At least one byte in the last 16-byte vector is negative.
4303       // Set up and look at the last 16 bytes as if they were a tail
4304       lea(ary1, Address(ary1, len, Address::times_1));
4305       addptr(result, len);
4306       // Ignore the very last byte: if all others are positive,
4307       // it must be negative, so we can skip right to the 2+1 byte
4308       // end comparison at this point
4309       orl(result, 15);
4310       movl(len, 15);
4311       // Fallthru to tail compare
4312     }
4313   }
4314 
4315   bind(TAIL_START);
4316   // Compare 4-byte vectors
4317   andl(len, 0xfffffffc); // vector count (in bytes)
4318   jccb(Assembler::zero, COMPARE_CHAR);
4319 
4320   lea(ary1, Address(ary1, len, Address::times_1));
4321   negptr(len);
4322 
4323   bind(COMPARE_VECTORS);
4324   movl(tmp1, Address(ary1, len, Address::times_1));
4325   andl(tmp1, 0x80808080);
4326   jccb(Assembler::notZero, TAIL_ADJUST);
4327   addptr(len, 4);
4328   jccb(Assembler::notZero, COMPARE_VECTORS);
4329 
4330   // Compare trailing char (final 2-3 bytes), if any
4331   bind(COMPARE_CHAR);
4332 
4333   testl(result, 0x2);   // tail  char
4334   jccb(Assembler::zero, COMPARE_BYTE);
4335   load_unsigned_short(tmp1, Address(ary1, 0));
4336   andl(tmp1, 0x00008080);
4337   jccb(Assembler::notZero, CHAR_ADJUST);
4338   lea(ary1, Address(ary1, 2));
4339 
4340   bind(COMPARE_BYTE);
4341   testl(result, 0x1);   // tail  byte
4342   jccb(Assembler::zero, DONE);
4343   load_unsigned_byte(tmp1, Address(ary1, 0));
4344   testl(tmp1, 0x00000080);
4345   jccb(Assembler::zero, DONE);
4346   subptr(result, 1);
4347   jmpb(DONE);
4348 
4349   bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
4351   // Adjust result and check the next three bytes
4352   addptr(result, len);
4353   orl(result, 3);
4354   lea(ary1, Address(ary1, len, Address::times_1));
4355   jmpb(COMPARE_CHAR);
4356 
4357   bind(CHAR_ADJUST);
4358   // We are looking at a char + optional byte tail, and found that one
4359   // of the bytes in the char is negative. Adjust the result, check the
4360   // first byte and readjust if needed.
4361   andl(result, 0xfffffffc);
4362   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4363   jccb(Assembler::notZero, DONE);
4364   addptr(result, 1);
4365 
4366   // That's it
4367   bind(DONE);
4368   if (UseAVX >= 2 && UseSSE >= 2) {
4369     // clean upper bits of YMM registers
4370     vpxor(vec1, vec1);
4371     vpxor(vec2, vec2);
4372   }
4373 }
4374 
4375 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4376 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4377                                       Register limit, Register result, Register chr,
4378                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4379   ShortBranchVerifier sbv(this);
4380   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4381 
4382   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4383   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4384 
4385   if (is_array_equ) {
4386     // Check the input args
4387     cmpoop(ary1, ary2);
4388     jcc(Assembler::equal, TRUE_LABEL);
4389 
4390     // Need additional checks for arrays_equals.
4391     testptr(ary1, ary1);
4392     jcc(Assembler::zero, FALSE_LABEL);
4393     testptr(ary2, ary2);
4394     jcc(Assembler::zero, FALSE_LABEL);
4395 
4396     // Check the lengths
4397     movl(limit, Address(ary1, length_offset));
4398     cmpl(limit, Address(ary2, length_offset));
4399     jcc(Assembler::notEqual, FALSE_LABEL);
4400   }
4401 
4402   // count == 0
4403   testl(limit, limit);
4404   jcc(Assembler::zero, TRUE_LABEL);
4405 
4406   if (is_array_equ) {
4407     // Load array address
4408     lea(ary1, Address(ary1, base_offset));
4409     lea(ary2, Address(ary2, base_offset));
4410   }
4411 
4412   if (is_array_equ && is_char) {
4413     // arrays_equals when used for char[].
4414     shll(limit, 1);      // byte count != 0
4415   }
4416   movl(result, limit); // copy
4417 
4418   if (UseAVX >= 2) {
4419     // With AVX2, use 32-byte vector compare
4420     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4421 
4422     // Compare 32-byte vectors
4423     andl(result, 0x0000001f);  //   tail count (in bytes)
4424     andl(limit, 0xffffffe0);   // vector count (in bytes)
4425     jcc(Assembler::zero, COMPARE_TAIL);
4426 
4427     lea(ary1, Address(ary1, limit, Address::times_1));
4428     lea(ary2, Address(ary2, limit, Address::times_1));
4429     negptr(limit);
4430 
4431 #ifdef _LP64
4432     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4433       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4434 
4435       cmpl(limit, -64);
4436       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4437 
4438       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4439 
4440       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4441       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4442       kortestql(mask, mask);
4443       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4444       addptr(limit, 64);  // update since we already compared at this addr
4445       cmpl(limit, -64);
4446       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4447 
4448       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4450       //  cmpl(limit, 0);
4451       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4452       // But since we stopped at the points ary{1,2}+limit which are
4453       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4454       // (|limit| <= 32 and result < 32),
4455       // we may just compare the last 64 bytes.
4456       //
4457       addptr(result, -64);   // it is safe, bc we just came from this area
4458       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4459       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4460       kortestql(mask, mask);
4461       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4462 
4463       jmp(TRUE_LABEL);
4464 
4465       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4466 
4467     }//if (VM_Version::supports_avx512vlbw())
4468 #endif //_LP64
4469     bind(COMPARE_WIDE_VECTORS);
4470     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4471     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4472     vpxor(vec1, vec2);
4473 
4474     vptest(vec1, vec1);
4475     jcc(Assembler::notZero, FALSE_LABEL);
4476     addptr(limit, 32);
4477     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4478 
4479     testl(result, result);
4480     jcc(Assembler::zero, TRUE_LABEL);
4481 
4482     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4483     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4484     vpxor(vec1, vec2);
4485 
4486     vptest(vec1, vec1);
4487     jccb(Assembler::notZero, FALSE_LABEL);
4488     jmpb(TRUE_LABEL);
4489 
4490     bind(COMPARE_TAIL); // limit is zero
4491     movl(limit, result);
4492     // Fallthru to tail compare
4493   } else if (UseSSE42Intrinsics) {
4494     // With SSE4.2, use double quad vector compare
4495     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4496 
4497     // Compare 16-byte vectors
4498     andl(result, 0x0000000f);  //   tail count (in bytes)
4499     andl(limit, 0xfffffff0);   // vector count (in bytes)
4500     jcc(Assembler::zero, COMPARE_TAIL);
4501 
4502     lea(ary1, Address(ary1, limit, Address::times_1));
4503     lea(ary2, Address(ary2, limit, Address::times_1));
4504     negptr(limit);
4505 
4506     bind(COMPARE_WIDE_VECTORS);
4507     movdqu(vec1, Address(ary1, limit, Address::times_1));
4508     movdqu(vec2, Address(ary2, limit, Address::times_1));
4509     pxor(vec1, vec2);
4510 
4511     ptest(vec1, vec1);
4512     jcc(Assembler::notZero, FALSE_LABEL);
4513     addptr(limit, 16);
4514     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4515 
4516     testl(result, result);
4517     jcc(Assembler::zero, TRUE_LABEL);
4518 
4519     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4520     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4521     pxor(vec1, vec2);
4522 
4523     ptest(vec1, vec1);
4524     jccb(Assembler::notZero, FALSE_LABEL);
4525     jmpb(TRUE_LABEL);
4526 
4527     bind(COMPARE_TAIL); // limit is zero
4528     movl(limit, result);
4529     // Fallthru to tail compare
4530   }
4531 
4532   // Compare 4-byte vectors
4533   andl(limit, 0xfffffffc); // vector count (in bytes)
4534   jccb(Assembler::zero, COMPARE_CHAR);
4535 
4536   lea(ary1, Address(ary1, limit, Address::times_1));
4537   lea(ary2, Address(ary2, limit, Address::times_1));
4538   negptr(limit);
4539 
4540   bind(COMPARE_VECTORS);
4541   movl(chr, Address(ary1, limit, Address::times_1));
4542   cmpl(chr, Address(ary2, limit, Address::times_1));
4543   jccb(Assembler::notEqual, FALSE_LABEL);
4544   addptr(limit, 4);
4545   jcc(Assembler::notZero, COMPARE_VECTORS);
4546 
4547   // Compare trailing char (final 2 bytes), if any
4548   bind(COMPARE_CHAR);
4549   testl(result, 0x2);   // tail  char
4550   jccb(Assembler::zero, COMPARE_BYTE);
4551   load_unsigned_short(chr, Address(ary1, 0));
4552   load_unsigned_short(limit, Address(ary2, 0));
4553   cmpl(chr, limit);
4554   jccb(Assembler::notEqual, FALSE_LABEL);
4555 
4556   if (is_array_equ && is_char) {
4557     bind(COMPARE_BYTE);
4558   } else {
4559     lea(ary1, Address(ary1, 2));
4560     lea(ary2, Address(ary2, 2));
4561 
4562     bind(COMPARE_BYTE);
4563     testl(result, 0x1);   // tail  byte
4564     jccb(Assembler::zero, TRUE_LABEL);
4565     load_unsigned_byte(chr, Address(ary1, 0));
4566     load_unsigned_byte(limit, Address(ary2, 0));
4567     cmpl(chr, limit);
4568     jccb(Assembler::notEqual, FALSE_LABEL);
4569   }
4570   bind(TRUE_LABEL);
4571   movl(result, 1);   // return true
4572   jmpb(DONE);
4573 
4574   bind(FALSE_LABEL);
4575   xorl(result, result); // return false
4576 
4577   // That's it
4578   bind(DONE);
4579   if (UseAVX >= 2) {
4580     // clean upper bits of YMM registers
4581     vpxor(vec1, vec1);
4582     vpxor(vec2, vec2);
4583   }
4584 }
4585 
4586 #ifdef _LP64
4587 
4588 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4589 #define __ masm.
4590   Register dst = stub.data<0>();
4591   XMMRegister src = stub.data<1>();
4592   address target = stub.data<2>();
4593   __ bind(stub.entry());
4594   __ subptr(rsp, 8);
4595   __ movdbl(Address(rsp), src);
4596   __ call(RuntimeAddress(target));
4597   __ pop(dst);
4598   __ jmp(stub.continuation());
4599 #undef __
4600 }
4601 
4602 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4603   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4604   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4605 
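  // The truncating conversions below return the "integer indefinite" value
  // (0x80000000, or 0x8000000000000000 for the 64-bit forms) when the source is
  // NaN or out of range. Since that bit pattern is also the legitimate result for
  // MIN_VALUE inputs, any result equal to it is sent to a fixup stub which
  // recomputes the Java answer (0 for NaN, saturation to MIN/MAX otherwise).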
4606   address slowpath_target;
4607   if (dst_bt == T_INT) {
4608     if (src_bt == T_FLOAT) {
4609       cvttss2sil(dst, src);
4610       cmpl(dst, 0x80000000);
4611       slowpath_target = StubRoutines::x86::f2i_fixup();
4612     } else {
4613       cvttsd2sil(dst, src);
4614       cmpl(dst, 0x80000000);
4615       slowpath_target = StubRoutines::x86::d2i_fixup();
4616     }
4617   } else {
4618     if (src_bt == T_FLOAT) {
4619       cvttss2siq(dst, src);
4620       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4621       slowpath_target = StubRoutines::x86::f2l_fixup();
4622     } else {
4623       cvttsd2siq(dst, src);
4624       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4625       slowpath_target = StubRoutines::x86::d2l_fixup();
4626     }
4627   }
4628 
4629   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4630   jcc(Assembler::equal, stub->entry());
4631   bind(stub->continuation());
4632 }
4633 
4634 #endif // _LP64
4635 
4636 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4637                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4638   switch(ideal_opc) {
4639     case Op_LShiftVS:
4640       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4641     case Op_LShiftVI:
4642       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4643     case Op_LShiftVL:
4644       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4645     case Op_RShiftVS:
4646       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4647     case Op_RShiftVI:
4648       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4649     case Op_RShiftVL:
4650       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4651     case Op_URShiftVS:
4652       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4653     case Op_URShiftVI:
4654       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4655     case Op_URShiftVL:
4656       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4657     case Op_RotateRightV:
4658       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4659     case Op_RotateLeftV:
4660       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4661     default:
4662       fatal("Unsupported masked operation"); break;
4663   }
4664 }
4665 
4666 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4667                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4668                                     bool is_varshift) {
4669   switch (ideal_opc) {
4670     case Op_AddVB:
4671       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4672     case Op_AddVS:
4673       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4674     case Op_AddVI:
4675       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4676     case Op_AddVL:
4677       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4678     case Op_AddVF:
4679       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4680     case Op_AddVD:
4681       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4682     case Op_SubVB:
4683       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4684     case Op_SubVS:
4685       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4686     case Op_SubVI:
4687       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4688     case Op_SubVL:
4689       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4690     case Op_SubVF:
4691       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4692     case Op_SubVD:
4693       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_MulVS:
4695       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_MulVI:
4697       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4698     case Op_MulVL:
4699       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_MulVF:
4701       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_MulVD:
4703       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4704     case Op_DivVF:
4705       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4706     case Op_DivVD:
4707       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4708     case Op_SqrtVF:
4709       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4710     case Op_SqrtVD:
4711       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_AbsVB:
4713       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4714     case Op_AbsVS:
4715       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4716     case Op_AbsVI:
4717       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4718     case Op_AbsVL:
4719       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4720     case Op_FmaVF:
4721       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_FmaVD:
4723       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4724     case Op_VectorRearrange:
4725       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4726     case Op_LShiftVS:
4727       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4728     case Op_LShiftVI:
4729       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4730     case Op_LShiftVL:
4731       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4732     case Op_RShiftVS:
4733       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4734     case Op_RShiftVI:
4735       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4736     case Op_RShiftVL:
4737       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4738     case Op_URShiftVS:
4739       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4740     case Op_URShiftVI:
4741       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4742     case Op_URShiftVL:
4743       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4744     case Op_RotateLeftV:
4745       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4746     case Op_RotateRightV:
4747       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4748     case Op_MaxV:
4749       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4750     case Op_MinV:
4751       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4752     case Op_XorV:
4753       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4754     case Op_OrV:
4755       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4756     case Op_AndV:
4757       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4758     default:
4759       fatal("Unsupported masked operation"); break;
4760   }
4761 }
4762 
4763 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4764                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4765   switch (ideal_opc) {
4766     case Op_AddVB:
4767       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_AddVS:
4769       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_AddVI:
4771       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_AddVL:
4773       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_AddVF:
4775       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_AddVD:
4777       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4778     case Op_SubVB:
4779       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4780     case Op_SubVS:
4781       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4782     case Op_SubVI:
4783       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4784     case Op_SubVL:
4785       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4786     case Op_SubVF:
4787       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_SubVD:
4789       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_MulVS:
4791       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4792     case Op_MulVI:
4793       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4794     case Op_MulVL:
4795       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4796     case Op_MulVF:
4797       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4798     case Op_MulVD:
4799       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4800     case Op_DivVF:
4801       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4802     case Op_DivVD:
4803       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4804     case Op_FmaVF:
4805       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4806     case Op_FmaVD:
4807       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4808     case Op_MaxV:
4809       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4810     case Op_MinV:
4811       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4812     case Op_XorV:
4813       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4814     case Op_OrV:
4815       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4816     case Op_AndV:
4817       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4818     default:
4819       fatal("Unsupported masked operation"); break;
4820   }
4821 }
4822 
4823 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4824                                   KRegister src1, KRegister src2) {
4825   BasicType etype = T_ILLEGAL;
4826   switch(mask_len) {
4827     case 2:
4828     case 4:
4829     case 8:  etype = T_BYTE; break;
4830     case 16: etype = T_SHORT; break;
4831     case 32: etype = T_INT; break;
4832     case 64: etype = T_LONG; break;
4833     default: fatal("Unsupported type"); break;
4834   }
4835   assert(etype != T_ILLEGAL, "");
4836   switch(ideal_opc) {
4837     case Op_AndVMask:
4838       kand(etype, dst, src1, src2); break;
4839     case Op_OrVMask:
4840       kor(etype, dst, src1, src2); break;
4841     case Op_XorVMask:
4842       kxor(etype, dst, src1, src2); break;
4843     default:
4844       fatal("Unsupported masked operation"); break;
4845   }
4846 }
4847 
4848 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4850  * If src is NaN, the result is 0.
4851  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4852  * the result is equal to the value of Integer.MIN_VALUE.
4853  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4854  * the result is equal to the value of Integer.MAX_VALUE.
4855  */
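// Scalar sketch of the semantics described in the comment above (illustrative
// only; the helper name is hypothetical and this is not the emitted code):
//   static int f2i(float src) {
//     if (Float.isNaN(src))         return 0;
//     if (src <= Integer.MIN_VALUE) return Integer.MIN_VALUE;
//     if (src >= Integer.MAX_VALUE) return Integer.MAX_VALUE;
//     return (int) src;
//   }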
4856 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4857                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4858                                                                    Register rscratch, AddressLiteral float_sign_flip,
4859                                                                    int vec_enc) {
4860   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4861   Label done;
4862   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4863   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4864   vptest(xtmp2, xtmp2, vec_enc);
4865   jccb(Assembler::equal, done);
4866 
4867   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4868   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4869 
4870   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4871   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4872   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4873 
  // Recompute the mask for the remaining special values.
4875   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4876   // Extract SRC values corresponding to TRUE mask lanes.
4877   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSB of mask lanes corresponding to positive
  // special values is set.
4880   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4881 
4882   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4883   bind(done);
4884 }
4885 
4886 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4887                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4888                                                                     Register rscratch, AddressLiteral float_sign_flip,
4889                                                                     int vec_enc) {
4890   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4891   Label done;
4892   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4893   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4894   kortestwl(ktmp1, ktmp1);
4895   jccb(Assembler::equal, done);
4896 
4897   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4898   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4899   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4900 
4901   kxorwl(ktmp1, ktmp1, ktmp2);
4902   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4903   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4904   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4905   bind(done);
4906 }
4907 
4908 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4909                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4910                                                                      Register rscratch, AddressLiteral double_sign_flip,
4911                                                                      int vec_enc) {
4912   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4913 
4914   Label done;
4915   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4916   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4917   kortestwl(ktmp1, ktmp1);
4918   jccb(Assembler::equal, done);
4919 
4920   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4921   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4922   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4923 
4924   kxorwl(ktmp1, ktmp1, ktmp2);
4925   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4926   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4927   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4928   bind(done);
4929 }
4930 
4931 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4932                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4933                                                                      Register rscratch, AddressLiteral float_sign_flip,
4934                                                                      int vec_enc) {
4935   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4936   Label done;
4937   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4938   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4939   kortestwl(ktmp1, ktmp1);
4940   jccb(Assembler::equal, done);
4941 
4942   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4943   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4944   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4945 
4946   kxorwl(ktmp1, ktmp1, ktmp2);
4947   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4948   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4949   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4950   bind(done);
4951 }
4952 
4953 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4955  * If src is NaN, the result is 0.
4956  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4957  * the result is equal to the value of Long.MIN_VALUE.
4958  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4959  * the result is equal to the value of Long.MAX_VALUE.
4960  */
4961 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4962                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4963                                                                       Register rscratch, AddressLiteral double_sign_flip,
4964                                                                       int vec_enc) {
4965   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4966 
4967   Label done;
4968   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4969   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4970   kortestwl(ktmp1, ktmp1);
4971   jccb(Assembler::equal, done);
4972 
4973   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4974   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4975   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4976 
4977   kxorwl(ktmp1, ktmp1, ktmp2);
4978   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4979   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4980   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4981   bind(done);
4982 }
4983 
4984 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4985                                                              XMMRegister xtmp, int index, int vec_enc) {
4986    assert(vec_enc < Assembler::AVX_512bit, "");
4987    if (vec_enc == Assembler::AVX_256bit) {
4988      vextractf128_high(xtmp, src);
4989      vshufps(dst, src, xtmp, index, vec_enc);
4990    } else {
4991      vshufps(dst, src, zero, index, vec_enc);
4992    }
4993 }
4994 
4995 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4996                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4997                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4998   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4999 
5000   Label done;
  // Compare the destination lanes with the float_sign_flip
  // value to get a mask of all special values.
5003   movdqu(xtmp1, float_sign_flip, rscratch);
5004   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5005   ptest(xtmp2, xtmp2);
5006   jccb(Assembler::equal, done);
5007 
5008   // Flip float_sign_flip to get max integer value.
5009   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5010   pxor(xtmp1, xtmp4);
5011 
  // Set destination lanes corresponding to unordered (NaN) source lanes to zero.
5013   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5014   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5015 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5017   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5018   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5019 
  // Recompute the mask for the remaining special values.
5021   pxor(xtmp2, xtmp3);
5022   // Extract mask corresponding to non-negative source lanes.
5023   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5024 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5026   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5027   pand(xtmp3, xtmp2);
5028 
  // Replace destination lanes holding the special value (0x80000000) with MaxInt
  // if the corresponding source lane holds a positive value.
5031   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5032   bind(done);
5033 }
5034 
5035 
5036 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5037                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5038   switch(to_elem_bt) {
5039     case T_SHORT:
5040       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5041       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5042       vpackusdw(dst, dst, zero, vec_enc);
5043       if (vec_enc == Assembler::AVX_256bit) {
5044         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5045       }
5046       break;
5047     case  T_BYTE:
5048       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5049       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5050       vpackusdw(dst, dst, zero, vec_enc);
5051       if (vec_enc == Assembler::AVX_256bit) {
5052         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5053       }
5054       vpackuswb(dst, dst, zero, vec_enc);
5055       break;
5056     default: assert(false, "%s", type2name(to_elem_bt));
5057   }
5058 }
5059 
5060 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value
 *    0x80000000. That value signifies that the source could be any of the special
 *    floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
5068  */
5069 
5070 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5071                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5072                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5073   int to_elem_sz = type2aelembytes(to_elem_bt);
5074   assert(to_elem_sz <= 4, "");
5075   vcvttps2dq(dst, src, vec_enc);
5076   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5077   if (to_elem_sz < 4) {
5078     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5079     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5080   }
5081 }
5082 
5083 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5084                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5085                                             Register rscratch, int vec_enc) {
5086   int to_elem_sz = type2aelembytes(to_elem_bt);
5087   assert(to_elem_sz <= 4, "");
5088   vcvttps2dq(dst, src, vec_enc);
5089   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5090   switch(to_elem_bt) {
5091     case T_INT:
5092       break;
5093     case T_SHORT:
5094       evpmovdw(dst, dst, vec_enc);
5095       break;
5096     case T_BYTE:
5097       evpmovdb(dst, dst, vec_enc);
5098       break;
5099     default: assert(false, "%s", type2name(to_elem_bt));
5100   }
5101 }
5102 
5103 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5104                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5105                                             Register rscratch, int vec_enc) {
5106   evcvttps2qq(dst, src, vec_enc);
5107   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5108 }
5109 
5110 // Handling for downcasting from double to integer or sub-word types on AVX2.
5111 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5112                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5113                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5114   int to_elem_sz = type2aelembytes(to_elem_bt);
5115   assert(to_elem_sz < 8, "");
5116   vcvttpd2dq(dst, src, vec_enc);
5117   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5118                                               float_sign_flip, vec_enc);
5119   if (to_elem_sz < 4) {
5120     // xtmp4 holds all zero lanes.
5121     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5122   }
5123 }
5124 
5125 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5126                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5127                                             KRegister ktmp2, AddressLiteral sign_flip,
5128                                             Register rscratch, int vec_enc) {
5129   if (VM_Version::supports_avx512dq()) {
5130     evcvttpd2qq(dst, src, vec_enc);
5131     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5132     switch(to_elem_bt) {
5133       case T_LONG:
5134         break;
5135       case T_INT:
5136         evpmovsqd(dst, dst, vec_enc);
5137         break;
5138       case T_SHORT:
5139         evpmovsqd(dst, dst, vec_enc);
5140         evpmovdw(dst, dst, vec_enc);
5141         break;
5142       case T_BYTE:
5143         evpmovsqd(dst, dst, vec_enc);
5144         evpmovdb(dst, dst, vec_enc);
5145         break;
5146       default: assert(false, "%s", type2name(to_elem_bt));
5147     }
5148   } else {
5149     assert(type2aelembytes(to_elem_bt) <= 4, "");
5150     vcvttpd2dq(dst, src, vec_enc);
5151     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5152     switch(to_elem_bt) {
5153       case T_INT:
5154         break;
5155       case T_SHORT:
5156         evpmovdw(dst, dst, vec_enc);
5157         break;
5158       case T_BYTE:
5159         evpmovdb(dst, dst, vec_enc);
5160         break;
5161       default: assert(false, "%s", type2name(to_elem_bt));
5162     }
5163   }
5164 }
5165 
5166 #ifdef _LP64
5167 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5168                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5169                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5172   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5173 
5174   mov64(tmp, julong_cast(0.5L));
5175   evpbroadcastq(xtmp1, tmp, vec_enc);
5176   vaddpd(xtmp1, src , xtmp1, vec_enc);
5177   evcvtpd2qq(dst, xtmp1, vec_enc);
5178   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5180 
5181   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5182 }
5183 
5184 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5185                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5186                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5189   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5190 
5191   movl(tmp, jint_cast(0.5));
5192   movq(xtmp1, tmp);
5193   vbroadcastss(xtmp1, xtmp1, vec_enc);
5194   vaddps(xtmp1, src , xtmp1, vec_enc);
5195   vcvtps2dq(dst, xtmp1, vec_enc);
5196   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5197                                               float_sign_flip, vec_enc);
5198 
5199   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5200 }
5201 
5202 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5203                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5204                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
5207   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5208 
5209   movl(tmp, jint_cast(0.5));
5210   movq(xtmp1, tmp);
5211   vbroadcastss(xtmp1, xtmp1, vec_enc);
5212   vaddps(xtmp1, src , xtmp1, vec_enc);
5213   vcvtps2dq(dst, xtmp1, vec_enc);
5214   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5215 
5216   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5217 }
5218 #endif // _LP64
5219 
5220 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5221                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5222   switch (from_elem_bt) {
5223     case T_BYTE:
5224       switch (to_elem_bt) {
5225         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5226         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5227         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5228         default: ShouldNotReachHere();
5229       }
5230       break;
5231     case T_SHORT:
5232       switch (to_elem_bt) {
5233         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5234         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5235         default: ShouldNotReachHere();
5236       }
5237       break;
5238     case T_INT:
5239       assert(to_elem_bt == T_LONG, "");
5240       vpmovzxdq(dst, src, vlen_enc);
5241       break;
5242     default:
5243       ShouldNotReachHere();
5244   }
5245 }
5246 
5247 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5248                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5249   switch (from_elem_bt) {
5250     case T_BYTE:
5251       switch (to_elem_bt) {
5252         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5253         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5254         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5255         default: ShouldNotReachHere();
5256       }
5257       break;
5258     case T_SHORT:
5259       switch (to_elem_bt) {
5260         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5261         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5262         default: ShouldNotReachHere();
5263       }
5264       break;
5265     case T_INT:
5266       assert(to_elem_bt == T_LONG, "");
5267       vpmovsxdq(dst, src, vlen_enc);
5268       break;
5269     default:
5270       ShouldNotReachHere();
5271   }
5272 }
5273 
5274 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5275                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5276   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5277   assert(vlen_enc != AVX_512bit, "");
5278 
5279   int dst_bt_size = type2aelembytes(dst_bt);
5280   int src_bt_size = type2aelembytes(src_bt);
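  // Vector mask lanes are assumed here to be all-zeros or all-ones (0 / -1).
  // Widening therefore uses sign extension (vpmovsx*) and narrowing uses
  // signed-saturating packs (vpackssdw/vpacksswb), both of which map
  // 0 -> 0 and -1 -> -1 per lane, preserving the mask property.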
5281   if (dst_bt_size > src_bt_size) {
5282     switch (dst_bt_size / src_bt_size) {
5283       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5284       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5285       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5286       default: ShouldNotReachHere();
5287     }
5288   } else {
5289     assert(dst_bt_size < src_bt_size, "");
5290     switch (src_bt_size / dst_bt_size) {
5291       case 2: {
5292         if (vlen_enc == AVX_128bit) {
5293           vpacksswb(dst, src, src, vlen_enc);
5294         } else {
5295           vpacksswb(dst, src, src, vlen_enc);
5296           vpermq(dst, dst, 0x08, vlen_enc);
5297         }
5298         break;
5299       }
5300       case 4: {
5301         if (vlen_enc == AVX_128bit) {
5302           vpackssdw(dst, src, src, vlen_enc);
5303           vpacksswb(dst, dst, dst, vlen_enc);
5304         } else {
5305           vpackssdw(dst, src, src, vlen_enc);
5306           vpermq(dst, dst, 0x08, vlen_enc);
5307           vpacksswb(dst, dst, dst, AVX_128bit);
5308         }
5309         break;
5310       }
5311       case 8: {
5312         if (vlen_enc == AVX_128bit) {
5313           vpshufd(dst, src, 0x08, vlen_enc);
5314           vpackssdw(dst, dst, dst, vlen_enc);
5315           vpacksswb(dst, dst, dst, vlen_enc);
5316         } else {
5317           vpshufd(dst, src, 0x08, vlen_enc);
5318           vpermq(dst, dst, 0x08, vlen_enc);
5319           vpackssdw(dst, dst, dst, AVX_128bit);
5320           vpacksswb(dst, dst, dst, AVX_128bit);
5321         }
5322         break;
5323       }
5324       default: ShouldNotReachHere();
5325     }
5326   }
5327 }
5328 
5329 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5330                                    bool merge, BasicType bt, int vlen_enc) {
5331   if (bt == T_INT) {
5332     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5333   } else {
5334     assert(bt == T_LONG, "");
5335     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5336   }
5337 }
5338 
5339 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5340                                    bool merge, BasicType bt, int vlen_enc) {
5341   if (bt == T_INT) {
5342     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5343   } else {
5344     assert(bt == T_LONG, "");
5345     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5346   }
5347 }
5348 
5349 #ifdef _LP64
5350 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5351                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5352                                                int vec_enc) {
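  // Expands the low mask_len bits of src into 0x00/0x01 byte lanes. Sketch of
  // the pdepq step below with illustrative values:
  //   pdep(0b1011, 0x0101010101010101) == 0x0000000001000101
  // i.e. bit i of src becomes the low bit of byte i.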
5353   int index = 0;
5354   int vindex = 0;
5355   mov64(rtmp1, 0x0101010101010101L);
5356   pdepq(rtmp1, src, rtmp1);
5357   if (mask_len > 8) {
5358     movq(rtmp2, src);
5359     vpxor(xtmp, xtmp, xtmp, vec_enc);
5360     movq(xtmp, rtmp1);
5361   }
5362   movq(dst, rtmp1);
5363 
5364   mask_len -= 8;
5365   while (mask_len > 0) {
5366     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5367     index++;
5368     if ((index % 2) == 0) {
5369       pxor(xtmp, xtmp);
5370     }
5371     mov64(rtmp1, 0x0101010101010101L);
5372     shrq(rtmp2, 8);
5373     pdepq(rtmp1, rtmp2, rtmp1);
5374     pinsrq(xtmp, rtmp1, index % 2);
5375     vindex = index / 2;
5376     if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
5379       if (index % 2) {
5380         vinsertf128(dst, dst, xtmp, vindex);
5381       }
5382     } else {
5383       vmovdqu(dst, xtmp);
5384     }
5385     mask_len -= 8;
5386   }
5387 }
5388 
5389 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5390   switch(opc) {
5391     case Op_VectorMaskTrueCount:
5392       popcntq(dst, tmp);
5393       break;
5394     case Op_VectorMaskLastTrue:
5395       if (VM_Version::supports_lzcnt()) {
5396         lzcntq(tmp, tmp);
5397         movl(dst, 63);
5398         subl(dst, tmp);
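        // Note: lzcnt of 0 is the operand width (64), so an all-false mask
        // yields 63 - 64 == -1, matching the bsrq fallback below.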
5399       } else {
5400         movl(dst, -1);
5401         bsrq(tmp, tmp);
5402         cmov32(Assembler::notZero, dst, tmp);
5403       }
5404       break;
5405     case Op_VectorMaskFirstTrue:
5406       if (VM_Version::supports_bmi1()) {
5407         if (masklen < 32) {
5408           orl(tmp, 1 << masklen);
5409           tzcntl(dst, tmp);
5410         } else if (masklen == 32) {
5411           tzcntl(dst, tmp);
5412         } else {
5413           assert(masklen == 64, "");
5414           tzcntq(dst, tmp);
5415         }
5416       } else {
5417         if (masklen < 32) {
5418           orl(tmp, 1 << masklen);
5419           bsfl(dst, tmp);
5420         } else {
5421           assert(masklen == 32 || masklen == 64, "");
5422           movl(dst, masklen);
5423           if (masklen == 32)  {
5424             bsfl(tmp, tmp);
5425           } else {
5426             bsfq(tmp, tmp);
5427           }
5428           cmov32(Assembler::notZero, dst, tmp);
5429         }
5430       }
5431       break;
5432     case Op_VectorMaskToLong:
5433       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5434       break;
5435     default: assert(false, "Unhandled mask operation");
5436   }
5437 }
5438 
5439 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5440                                               int masklen, int masksize, int vec_enc) {
5441   assert(VM_Version::supports_popcnt(), "");
5442 
  if (VM_Version::supports_avx512bw()) {
5444     kmovql(tmp, mask);
5445   } else {
5446     assert(masklen <= 16, "");
5447     kmovwl(tmp, mask);
5448   }
5449 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5452   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5453     andq(tmp, (1 << masklen) - 1);
5454   }
5455 
5456   vector_mask_operation_helper(opc, dst, tmp, masklen);
5457 }
5458 
5459 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5460                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5461   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5462          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5463   assert(VM_Version::supports_popcnt(), "");
5464 
5465   bool need_clip = false;
5466   switch(bt) {
5467     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1.
5469       vpxor(xtmp, xtmp, xtmp, vec_enc);
5470       vpsubb(xtmp, xtmp, mask, vec_enc);
5471       vpmovmskb(tmp, xtmp, vec_enc);
5472       need_clip = masklen < 16;
5473       break;
5474     case T_BYTE:
5475       vpmovmskb(tmp, mask, vec_enc);
5476       need_clip = masklen < 16;
5477       break;
5478     case T_SHORT:
5479       vpacksswb(xtmp, mask, mask, vec_enc);
5480       if (masklen >= 16) {
5481         vpermpd(xtmp, xtmp, 8, vec_enc);
5482       }
5483       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5484       need_clip = masklen < 16;
5485       break;
5486     case T_INT:
5487     case T_FLOAT:
5488       vmovmskps(tmp, mask, vec_enc);
5489       need_clip = masklen < 4;
5490       break;
5491     case T_LONG:
5492     case T_DOUBLE:
5493       vmovmskpd(tmp, mask, vec_enc);
5494       need_clip = masklen < 2;
5495       break;
5496     default: assert(false, "Unhandled type, %s", type2name(bt));
5497   }
5498 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5501   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5502     // need_clip implies masklen < 32
5503     andq(tmp, (1 << masklen) - 1);
5504   }
5505 
5506   vector_mask_operation_helper(opc, dst, tmp, masklen);
5507 }
5508 
5509 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5510                                              Register rtmp2, int mask_len) {
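  // Produce the post-compress mask: pextq gathers as many 1-bits from the
  // all-ones value in rtmp2 as there are set bits in the (clipped) source mask,
  // packing them into the low bits, i.e. dst gets POPCNT(src) contiguous low
  // bits set.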
5511   kmov(rtmp1, src);
5512   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5513   mov64(rtmp2, -1L);
5514   pextq(rtmp2, rtmp2, rtmp1);
5515   kmov(dst, rtmp2);
5516 }
5517 
5518 #ifdef _LP64
5519 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5520                                                     XMMRegister mask, Register rtmp, Register rscratch,
5521                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5522                                                     int vec_enc) {
5523   assert(type2aelembytes(bt) >= 4, "");
5524   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5525   address compress_perm_table = nullptr;
5526   address expand_perm_table = nullptr;
5527   if (type2aelembytes(bt) == 8) {
5528     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5529     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5530     vmovmskpd(rtmp, mask, vec_enc);
5531   } else {
5532     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5533     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5534     vmovmskps(rtmp, mask, vec_enc);
5535   }
5536   shlq(rtmp, 5); // for 32 byte permute row.
5537   if (opcode == Op_CompressV) {
5538     lea(rscratch, ExternalAddress(compress_perm_table));
5539   } else {
5540     lea(rscratch, ExternalAddress(expand_perm_table));
5541   }
5542   addptr(rtmp, rscratch);
5543   vmovdqu(permv, Address(rtmp));
5544   vpermps(dst, permv, src, Assembler::AVX_256bit);
5545   vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with the zero vector using the permute mask: each column entry
  // in a permute table row contains either a valid permute index or -1 (the default
  // value), so the same row can also be used as a blending mask after
  // compressing/expanding the source vector lanes.
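  // As an illustration only (the actual rows live in the StubRoutines permute
  // tables), a 32-bit compress with mask 0b0101 conceptually uses the row
  // {0, 2, -1, -1, ...}: lanes 0 and 2 are packed into positions 0 and 1 by
  // vpermps above, and the -1 entries (sign bit set) select the zero vector
  // in the vblendvps below.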
5550   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5551 }
5552 #endif
5553 
5554 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5555                                                bool merge, BasicType bt, int vec_enc) {
5556   if (opcode == Op_CompressV) {
5557     switch(bt) {
5558     case T_BYTE:
5559       evpcompressb(dst, mask, src, merge, vec_enc);
5560       break;
5561     case T_CHAR:
5562     case T_SHORT:
5563       evpcompressw(dst, mask, src, merge, vec_enc);
5564       break;
5565     case T_INT:
5566       evpcompressd(dst, mask, src, merge, vec_enc);
5567       break;
5568     case T_FLOAT:
5569       evcompressps(dst, mask, src, merge, vec_enc);
5570       break;
5571     case T_LONG:
5572       evpcompressq(dst, mask, src, merge, vec_enc);
5573       break;
5574     case T_DOUBLE:
5575       evcompresspd(dst, mask, src, merge, vec_enc);
5576       break;
5577     default:
5578       fatal("Unsupported type %s", type2name(bt));
5579       break;
5580     }
5581   } else {
5582     assert(opcode == Op_ExpandV, "");
5583     switch(bt) {
5584     case T_BYTE:
5585       evpexpandb(dst, mask, src, merge, vec_enc);
5586       break;
5587     case T_CHAR:
5588     case T_SHORT:
5589       evpexpandw(dst, mask, src, merge, vec_enc);
5590       break;
5591     case T_INT:
5592       evpexpandd(dst, mask, src, merge, vec_enc);
5593       break;
5594     case T_FLOAT:
5595       evexpandps(dst, mask, src, merge, vec_enc);
5596       break;
5597     case T_LONG:
5598       evpexpandq(dst, mask, src, merge, vec_enc);
5599       break;
5600     case T_DOUBLE:
5601       evexpandpd(dst, mask, src, merge, vec_enc);
5602       break;
5603     default:
5604       fatal("Unsupported type %s", type2name(bt));
5605       break;
5606     }
5607   }
5608 }
5609 #endif
5610 
5611 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5612                                            KRegister ktmp1, int vec_enc) {
5613   if (opcode == Op_SignumVD) {
5614     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5616     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5617     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5619     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5620     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5621   } else {
5622     assert(opcode == Op_SignumVF, "");
5623     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5625     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5626     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5628     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5629     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5630   }
5631 }
5632 
5633 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5634                                           XMMRegister xtmp1, int vec_enc) {
5635   if (opcode == Op_SignumVD) {
5636     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5638     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5640     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5641     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5642   } else {
5643     assert(opcode == Op_SignumVF, "");
5644     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5646     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5648     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5649     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5650   }
5651 }
5652 
5653 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5654   if (VM_Version::supports_avx512bw()) {
5655     if (mask_len > 32) {
5656       kmovql(dst, src);
5657     } else {
5658       kmovdl(dst, src);
5659       if (mask_len != 32) {
5660         kshiftrdl(dst, dst, 32 - mask_len);
5661       }
5662     }
5663   } else {
5664     assert(mask_len <= 16, "");
5665     kmovwl(dst, src);
5666     if (mask_len != 16) {
5667       kshiftrwl(dst, dst, 16 - mask_len);
5668     }
5669   }
5670 }
5671 
5672 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5673   int lane_size = type2aelembytes(bt);
5674   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5675   if ((is_LP64 || lane_size < 8) &&
5676       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5677        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5678     movptr(rtmp, imm32);
5679     switch(lane_size) {
5680       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5681       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5682       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5683       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5686     }
5687   } else {
5688     movptr(rtmp, imm32);
5689     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5690     switch(lane_size) {
5691       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5692       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5693       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5694       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size); break;
5697     }
5698   }
5699 }
5700 
5701 //
// The following is a lookup table based popcount computation algorithm:
5703 //       Index   Bit set count
5704 //     [ 0000 ->   0,
5705 //       0001 ->   1,
5706 //       0010 ->   1,
5707 //       0011 ->   2,
5708 //       0100 ->   1,
5709 //       0101 ->   2,
5710 //       0110 ->   2,
5711 //       0111 ->   3,
5712 //       1000 ->   1,
5713 //       1001 ->   2,
5714 //       1010 ->   3,
5715 //       1011 ->   3,
5716 //       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
5719 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5720 //     shuffle indices for lookup table access.
5721 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5723 //     shuffle indices for lookup table access.
5724 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5725 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5726 //     count of all the bytes of a quadword.
5727 //  f. Perform step e. for upper 128bit vector lane.
5728 //  g. Pack the bitset count of quadwords back to double word.
5729 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
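//
// For illustration only (this is not part of the emitted code), steps a-d
// correspond to the following scalar sketch, where 'lut' is the 16-entry
// table shown above:
//
//   static const uint8_t lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//   uint8_t popcount_byte(uint8_t b) {
//     return lut[b & 0x0F] + lut[(b >> 4) & 0x0F];
//   }
//
// The vector code below performs the same two table lookups for every byte
// lane at once using vpshufb against StubRoutines::x86::vector_popcount_lut().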
5730 
5731 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5732                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5733   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5734   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5735   vpsrlw(dst, src, 4, vec_enc);
5736   vpand(dst, dst, xtmp1, vec_enc);
5737   vpand(xtmp1, src, xtmp1, vec_enc);
5738   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5739   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5740   vpshufb(dst, xtmp2, dst, vec_enc);
5741   vpaddb(dst, dst, xtmp1, vec_enc);
5742 }
5743 
5744 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5745                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5746   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code implements steps e, f, g and h of the above algorithm.
5748   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5749   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5750   vpsadbw(dst, dst, xtmp2, vec_enc);
5751   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5752   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5753   vpackuswb(dst, xtmp1, dst, vec_enc);
5754 }
5755 
5756 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5757                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5758   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcounts of the upper and lower bytes of each word.
5760   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5761   vpsrlw(dst, xtmp1, 8, vec_enc);
5762   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5763   vpaddw(dst, dst, xtmp1, vec_enc);
5764 }
5765 
5766 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5767                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5768   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5769   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5770   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5771 }
5772 
5773 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5774                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5775   switch(bt) {
5776     case T_LONG:
5777       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5778       break;
5779     case T_INT:
5780       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5781       break;
5782     case T_CHAR:
5783     case T_SHORT:
5784       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5785       break;
5786     case T_BYTE:
5787     case T_BOOLEAN:
5788       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5789       break;
5790     default:
5791       fatal("Unsupported type %s", type2name(bt));
5792       break;
5793   }
5794 }
5795 
5796 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5797                                                       KRegister mask, bool merge, int vec_enc) {
5798   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5799   switch(bt) {
5800     case T_LONG:
5801       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5802       evpopcntq(dst, mask, src, merge, vec_enc);
5803       break;
5804     case T_INT:
5805       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5806       evpopcntd(dst, mask, src, merge, vec_enc);
5807       break;
5808     case T_CHAR:
5809     case T_SHORT:
5810       assert(VM_Version::supports_avx512_bitalg(), "");
5811       evpopcntw(dst, mask, src, merge, vec_enc);
5812       break;
5813     case T_BYTE:
5814     case T_BOOLEAN:
5815       assert(VM_Version::supports_avx512_bitalg(), "");
5816       evpopcntb(dst, mask, src, merge, vec_enc);
5817       break;
5818     default:
5819       fatal("Unsupported type %s", type2name(bt));
5820       break;
5821   }
5822 }
5823 
5824 #ifndef _LP64
5825 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5826   assert(VM_Version::supports_avx512bw(), "");
5827   kmovdl(tmp, src);
5828   kunpckdql(dst, tmp, tmp);
5829 }
5830 #endif
5831 
// The bit reversal algorithm first reverses the bits of each byte and then
// performs a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm uses a lookup table to obtain the reversed bit sequence
// corresponding to a 4 bit value; the reversed bit sequence of a byte is
// then obtained by swapping the reversed sequences of its upper and lower
// nibbles.
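//
// For illustration only (not part of the emitted code), the per-byte step
// corresponds to the following scalar sketch, where 'rev_lut[i]' is assumed to
// hold the bit-reversed value of the 4-bit index i (e.g. rev_lut[0b0001] == 0b1000):
//
//   uint8_t reverse_bits_in_byte(uint8_t b) {
//     return (uint8_t)((rev_lut[b & 0x0F] << 4) | rev_lut[(b >> 4) & 0x0F]);
//   }
//
// The vector variants below perform the nibble lookups with vpshufb against
// StubRoutines::x86::vector_reverse_bit_lut() and then, for short/int/long,
// reverse the byte order via vector_reverse_byte().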
5838 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5839                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5840   if (VM_Version::supports_avx512vlbw()) {
5841 
5842     // Get the reverse bit sequence of lower nibble of each byte.
5843     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5844     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5845     evpandq(dst, xtmp2, src, vec_enc);
5846     vpshufb(dst, xtmp1, dst, vec_enc);
5847     vpsllq(dst, dst, 4, vec_enc);
5848 
5849     // Get the reverse bit sequence of upper nibble of each byte.
5850     vpandn(xtmp2, xtmp2, src, vec_enc);
5851     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5852     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5853 
    // OR the left shifted reversed bit sequence of the lower nibble with the right
    // shifted reversed bit sequence of the upper nibble to obtain the reversed bit
    // sequence of each byte.
5856     evporq(xtmp2, dst, xtmp2, vec_enc);
5857     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5858 
  } else if (vec_enc == Assembler::AVX_512bit) {
5860     // Shift based bit reversal.
5861     assert(bt == T_LONG || bt == T_INT, "");
5862 
5863     // Swap lower and upper nibble of each byte.
5864     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5865 
5866     // Swap two least and most significant bits of each nibble.
5867     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5868 
5869     // Swap adjacent pair of bits.
5870     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5871     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5872 
5873     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5874     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5875   } else {
5876     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5877     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5878 
5879     // Get the reverse bit sequence of lower nibble of each byte.
5880     vpand(dst, xtmp2, src, vec_enc);
5881     vpshufb(dst, xtmp1, dst, vec_enc);
5882     vpsllq(dst, dst, 4, vec_enc);
5883 
5884     // Get the reverse bit sequence of upper nibble of each byte.
5885     vpandn(xtmp2, xtmp2, src, vec_enc);
5886     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5887     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5888 
    // OR the left shifted reversed bit sequence of the lower nibble with the right
    // shifted reversed bit sequence of the upper nibble to obtain the reversed bit
    // sequence of each byte.
5891     vpor(xtmp2, dst, xtmp2, vec_enc);
5892     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5893   }
5894 }
5895 
5896 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5897                                                 XMMRegister xtmp, Register rscratch) {
5898   assert(VM_Version::supports_gfni(), "");
5899   assert(rscratch != noreg || always_reachable(mask), "missing");
5900 
  // Galois field instruction based bit reversal, following the algorithm described at
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5903   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5904   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5905   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5906 }
5907 
5908 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5909                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5910   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5911   evpandq(dst, xtmp1, src, vec_enc);
5912   vpsllq(dst, dst, nbits, vec_enc);
5913   vpandn(xtmp1, xtmp1, src, vec_enc);
5914   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5915   evporq(dst, dst, xtmp1, vec_enc);
5916 }
5917 
5918 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5919                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5920   // Shift based bit reversal.
5921   assert(VM_Version::supports_evex(), "");
5922   switch(bt) {
5923     case T_LONG:
5924       // Swap upper and lower double word of each quad word.
5925       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5926       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5927       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5928       break;
5929     case T_INT:
5930       // Swap upper and lower word of each double word.
5931       evprord(xtmp1, k0, src, 16, true, vec_enc);
5932       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5933       break;
5934     case T_CHAR:
5935     case T_SHORT:
5936       // Swap upper and lower byte of each word.
5937       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5938       break;
5939     case T_BYTE:
5940       evmovdquq(dst, k0, src, true, vec_enc);
5941       break;
5942     default:
5943       fatal("Unsupported type %s", type2name(bt));
5944       break;
5945   }
5946 }
5947 
5948 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5949   if (bt == T_BYTE) {
5950     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5951       evmovdquq(dst, k0, src, true, vec_enc);
5952     } else {
5953       vmovdqu(dst, src);
5954     }
5955     return;
5956   }
5957   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5958   // pre-computed shuffle indices.
5959   switch(bt) {
5960     case T_LONG:
5961       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5962       break;
5963     case T_INT:
5964       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5965       break;
5966     case T_CHAR:
5967     case T_SHORT:
5968       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5969       break;
5970     default:
5971       fatal("Unsupported type %s", type2name(bt));
5972       break;
5973   }
5974   vpshufb(dst, src, dst, vec_enc);
5975 }
5976 
5977 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5978                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5979                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5980   assert(is_integral_type(bt), "");
5981   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5982   assert(VM_Version::supports_avx512cd(), "");
5983   switch(bt) {
5984     case T_LONG:
5985       evplzcntq(dst, ktmp, src, merge, vec_enc);
5986       break;
5987     case T_INT:
5988       evplzcntd(dst, ktmp, src, merge, vec_enc);
5989       break;
5990     case T_SHORT:
5991       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5992       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5993       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5994       vpunpckhwd(dst, xtmp1, src, vec_enc);
5995       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5996       vpackusdw(dst, xtmp2, dst, vec_enc);
5997       break;
5998     case T_BYTE:
5999       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6000       // accessing the lookup table.
6001       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6002       // accessing the lookup table.
6003       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
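      // For illustration only: assuming the lookup table maps a 4 bit value to
      // its leading zero count (lut[0b0000] = 4, lut[0b0001] = 3, ...,
      // lut[0b1xxx] = 0), the byte 0b0001'0110 yields T2 = 3 and T1 = 1; since
      // its MSB nibble is non-zero the result stays 3, whereas 0b0000'0110
      // yields 4 + 1 = 5.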
6004       assert(VM_Version::supports_avx512bw(), "");
6005       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6006       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6007       vpand(xtmp2, dst, src, vec_enc);
6008       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6009       vpsrlw(xtmp3, src, 4, vec_enc);
6010       vpand(xtmp3, dst, xtmp3, vec_enc);
6011       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6012       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6013       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6014       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6015       break;
6016     default:
6017       fatal("Unsupported type %s", type2name(bt));
6018       break;
6019   }
6020 }
6021 
6022 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6023                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6024   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6025   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6026   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6027   // accessing the lookup table.
6028   vpand(dst, xtmp2, src, vec_enc);
6029   vpshufb(dst, xtmp1, dst, vec_enc);
6030   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6031   // accessing the lookup table.
6032   vpsrlw(xtmp3, src, 4, vec_enc);
6033   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6034   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6035   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6036   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6037   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6038   vpaddb(dst, dst, xtmp2, vec_enc);
6039   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6040 }
6041 
6042 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6043                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6044   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6045   // Add zero counts of lower byte and upper byte of a word if
6046   // upper byte holds a zero value.
6047   vpsrlw(xtmp3, src, 8, vec_enc);
6048   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6049   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6050   vpsllw(xtmp2, dst, 8, vec_enc);
6051   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6052   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6053   vpsrlw(dst, dst, 8, vec_enc);
6054 }
6055 
6056 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6057                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling is needed for zero, max_int and negative source values.
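  //
  // For illustration only (mirroring the instruction sequence below):
  // for src = 0x00001234 the float conversion gives biased_exp = 127 + 12 = 139,
  // so exponent = 139 - 127 + 1 = 13 and LZCNT = 32 - 13 = 19, which matches the
  // 19 leading zero bits of 0x00001234.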
6063 
6064   // Broadcast 0xFF
6065   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6066   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6067 
6068   // Extract biased exponent.
6069   vcvtdq2ps(dst, src, vec_enc);
6070   vpsrld(dst, dst, 23, vec_enc);
6071   vpand(dst, dst, xtmp1, vec_enc);
6072 
6073   // Broadcast 127.
6074   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6075   // Exponent = biased_exp - 127
6076   vpsubd(dst, dst, xtmp1, vec_enc);
6077 
6078   // Exponent = Exponent  + 1
6079   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6080   vpaddd(dst, dst, xtmp3, vec_enc);
6081 
  // Replace a negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
6084   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6085   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6086 
6087   // Rematerialize broadcast 32.
6088   vpslld(xtmp1, xtmp3, 5, vec_enc);
6089   // Exponent is 32 if corresponding source lane contains max_int value.
6090   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6091   // LZCNT = 32 - exponent
6092   vpsubd(dst, xtmp1, dst, vec_enc);
6093 
6094   // Replace LZCNT with a value 1 if corresponding source lane
6095   // contains max_int value.
6096   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6097 
  // Replace LZCNT with 0 if the source lane value is less than zero.
6099   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6100   vblendvps(dst, dst, xtmp2, src, vec_enc);
6101 }
6102 
6103 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6104                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6105   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6106   // Add zero counts of lower word and upper word of a double word if
6107   // upper word holds a zero value.
6108   vpsrld(xtmp3, src, 16, vec_enc);
6109   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6110   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6111   vpslld(xtmp2, dst, 16, vec_enc);
6112   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6113   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6114   vpsrld(dst, dst, 16, vec_enc);
6115   // Add zero counts of lower doubleword and upper doubleword of a
6116   // quadword if upper doubleword holds a zero value.
6117   vpsrlq(xtmp3, src, 32, vec_enc);
6118   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6119   vpsllq(xtmp2, dst, 32, vec_enc);
6120   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6121   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6122   vpsrlq(dst, dst, 32, vec_enc);
6123 }
6124 
6125 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6126                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6127                                                        Register rtmp, int vec_enc) {
6128   assert(is_integral_type(bt), "unexpected type");
6129   assert(vec_enc < Assembler::AVX_512bit, "");
6130   switch(bt) {
6131     case T_LONG:
6132       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6133       break;
6134     case T_INT:
6135       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6136       break;
6137     case T_SHORT:
6138       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6139       break;
6140     case T_BYTE:
6141       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6142       break;
6143     default:
6144       fatal("Unsupported type %s", type2name(bt));
6145       break;
6146   }
6147 }
6148 
6149 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6150   switch(bt) {
6151     case T_BYTE:
6152       vpsubb(dst, src1, src2, vec_enc);
6153       break;
6154     case T_SHORT:
6155       vpsubw(dst, src1, src2, vec_enc);
6156       break;
6157     case T_INT:
6158       vpsubd(dst, src1, src2, vec_enc);
6159       break;
6160     case T_LONG:
6161       vpsubq(dst, src1, src2, vec_enc);
6162       break;
6163     default:
6164       fatal("Unsupported type %s", type2name(bt));
6165       break;
6166   }
6167 }
6168 
// Trailing zero count computation is based on the leading zero count operation, as per the
// following equation. All AVX3 targets support the AVX512CD feature, which offers a
// direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
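// For illustration only, for a 32-bit lane with x = 24 (0b11000):
//   (x - 1) & ~x = 23 & ~24 = 0b00111, CLZ(0b00111) = 29, so CTZ = 32 - 29 = 3,
//   matching the three trailing zero bits of 24.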
6173 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6174                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6175                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6176   assert(is_integral_type(bt), "");
6177   // xtmp = -1
6178   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6179   // xtmp = xtmp + src
6180   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6181   // xtmp = xtmp & ~src
6182   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6183   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6184   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6185   vpsub(bt, dst, xtmp4, dst, vec_enc);
6186 }
6187 
// Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
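// For illustration only, for a 32-bit lane with x = 24 (0b11000):
//   x | -x = 0x18 | 0xFFFFFFE8 = 0xFFFFFFF8, POPC(0xFFFFFFF8) = 29,
//   so CTZ = 32 - 29 = 3.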
6190 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6191                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6192   assert(is_integral_type(bt), "");
6193   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6195   // xtmp = 0 - src
6196   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6197   // xtmp = xtmp | src
6198   vpor(xtmp3, xtmp3, src, vec_enc);
6199   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6200   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6201   vpsub(bt, dst, xtmp1, dst, vec_enc);
6202 }
6203 
6204 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6205   Label done;
6206   Label neg_divisor_fastpath;
6207   cmpl(divisor, 0);
6208   jccb(Assembler::less, neg_divisor_fastpath);
6209   xorl(rdx, rdx);
6210   divl(divisor);
6211   jmpb(done);
6212   bind(neg_divisor_fastpath);
6213   // Fastpath for divisor < 0:
6214   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6215   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
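  // For illustration only: when the divisor has its sign bit set it is >= 2^31
  // as an unsigned value, so the unsigned quotient can only be 0 or 1. E.g. with
  // dividend = 7 and divisor = -3 (4294967293 unsigned): (7 & ~(7 - (-3))) >>> 31
  // = (7 & ~10) >>> 31 = 5 >>> 31 = 0, the expected unsigned quotient.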
6216   movl(rdx, rax);
6217   subl(rdx, divisor);
6218   if (VM_Version::supports_bmi1()) {
6219     andnl(rax, rdx, rax);
6220   } else {
6221     notl(rdx);
6222     andl(rax, rdx);
6223   }
6224   shrl(rax, 31);
6225   bind(done);
6226 }
6227 
6228 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6229   Label done;
6230   Label neg_divisor_fastpath;
6231   cmpl(divisor, 0);
6232   jccb(Assembler::less, neg_divisor_fastpath);
6233   xorl(rdx, rdx);
6234   divl(divisor);
6235   jmpb(done);
6236   bind(neg_divisor_fastpath);
6237   // Fastpath when divisor < 0:
6238   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6239   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6240   movl(rdx, rax);
6241   subl(rax, divisor);
6242   if (VM_Version::supports_bmi1()) {
6243     andnl(rax, rax, rdx);
6244   } else {
6245     notl(rax);
6246     andl(rax, rdx);
6247   }
6248   sarl(rax, 31);
6249   andl(rax, divisor);
6250   subl(rdx, rax);
6251   bind(done);
6252 }
6253 
6254 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6255   Label done;
6256   Label neg_divisor_fastpath;
6257 
6258   cmpl(divisor, 0);
6259   jccb(Assembler::less, neg_divisor_fastpath);
6260   xorl(rdx, rdx);
6261   divl(divisor);
6262   jmpb(done);
6263   bind(neg_divisor_fastpath);
6264   // Fastpath for divisor < 0:
6265   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6266   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6267   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6268   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6269   movl(rdx, rax);
6270   subl(rax, divisor);
6271   if (VM_Version::supports_bmi1()) {
6272     andnl(rax, rax, rdx);
6273   } else {
6274     notl(rax);
6275     andl(rax, rdx);
6276   }
6277   movl(tmp, rax);
6278   shrl(rax, 31); // quotient
6279   sarl(tmp, 31);
6280   andl(tmp, divisor);
6281   subl(rdx, tmp); // remainder
6282   bind(done);
6283 }
6284 
6285 #ifdef _LP64
6286 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6287                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6291     mov64(rtmp, 0x8040201008040201L);
6292     movq(xtmp1, src);
6293     movq(xtmp2, rtmp);
6294     gf2p8affineqb(xtmp1, xtmp2, 0);
6295     movq(dst, xtmp1);
6296   } else {
6297     // Swap even and odd numbered bits.
6298     movl(rtmp, src);
6299     andl(rtmp, 0x55555555);
6300     shll(rtmp, 1);
6301     movl(dst, src);
6302     andl(dst, 0xAAAAAAAA);
6303     shrl(dst, 1);
6304     orl(dst, rtmp);
6305 
6306     // Swap LSB and MSB 2 bits of each nibble.
6307     movl(rtmp, dst);
6308     andl(rtmp, 0x33333333);
6309     shll(rtmp, 2);
6310     andl(dst, 0xCCCCCCCC);
6311     shrl(dst, 2);
6312     orl(dst, rtmp);
6313 
6314     // Swap LSB and MSB 4 bits of each byte.
6315     movl(rtmp, dst);
6316     andl(rtmp, 0x0F0F0F0F);
6317     shll(rtmp, 4);
6318     andl(dst, 0xF0F0F0F0);
6319     shrl(dst, 4);
6320     orl(dst, rtmp);
6321   }
6322   bswapl(dst);
6323 }
6324 
6325 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6326                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6330     mov64(rtmp1, 0x8040201008040201L);
6331     movq(xtmp1, src);
6332     movq(xtmp2, rtmp1);
6333     gf2p8affineqb(xtmp1, xtmp2, 0);
6334     movq(dst, xtmp1);
6335   } else {
6336     // Swap even and odd numbered bits.
6337     movq(rtmp1, src);
6338     mov64(rtmp2, 0x5555555555555555L);
6339     andq(rtmp1, rtmp2);
6340     shlq(rtmp1, 1);
6341     movq(dst, src);
6342     notq(rtmp2);
6343     andq(dst, rtmp2);
6344     shrq(dst, 1);
6345     orq(dst, rtmp1);
6346 
6347     // Swap LSB and MSB 2 bits of each nibble.
6348     movq(rtmp1, dst);
6349     mov64(rtmp2, 0x3333333333333333L);
6350     andq(rtmp1, rtmp2);
6351     shlq(rtmp1, 2);
6352     notq(rtmp2);
6353     andq(dst, rtmp2);
6354     shrq(dst, 2);
6355     orq(dst, rtmp1);
6356 
6357     // Swap LSB and MSB 4 bits of each byte.
6358     movq(rtmp1, dst);
6359     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6360     andq(rtmp1, rtmp2);
6361     shlq(rtmp1, 4);
6362     notq(rtmp2);
6363     andq(dst, rtmp2);
6364     shrq(dst, 4);
6365     orq(dst, rtmp1);
6366   }
6367   bswapq(dst);
6368 }
6369 
6370 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6371   Label done;
6372   Label neg_divisor_fastpath;
6373   cmpq(divisor, 0);
6374   jccb(Assembler::less, neg_divisor_fastpath);
6375   xorl(rdx, rdx);
6376   divq(divisor);
6377   jmpb(done);
6378   bind(neg_divisor_fastpath);
6379   // Fastpath for divisor < 0:
6380   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6381   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6382   movq(rdx, rax);
6383   subq(rdx, divisor);
6384   if (VM_Version::supports_bmi1()) {
6385     andnq(rax, rdx, rax);
6386   } else {
6387     notq(rdx);
6388     andq(rax, rdx);
6389   }
6390   shrq(rax, 63);
6391   bind(done);
6392 }
6393 
6394 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6395   Label done;
6396   Label neg_divisor_fastpath;
6397   cmpq(divisor, 0);
6398   jccb(Assembler::less, neg_divisor_fastpath);
6399   xorq(rdx, rdx);
6400   divq(divisor);
6401   jmp(done);
6402   bind(neg_divisor_fastpath);
6403   // Fastpath when divisor < 0:
6404   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6405   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6406   movq(rdx, rax);
6407   subq(rax, divisor);
6408   if (VM_Version::supports_bmi1()) {
6409     andnq(rax, rax, rdx);
6410   } else {
6411     notq(rax);
6412     andq(rax, rdx);
6413   }
6414   sarq(rax, 63);
6415   andq(rax, divisor);
6416   subq(rdx, rax);
6417   bind(done);
6418 }
6419 
6420 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6421   Label done;
6422   Label neg_divisor_fastpath;
6423   cmpq(divisor, 0);
6424   jccb(Assembler::less, neg_divisor_fastpath);
6425   xorq(rdx, rdx);
6426   divq(divisor);
6427   jmp(done);
6428   bind(neg_divisor_fastpath);
6429   // Fastpath for divisor < 0:
6430   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6431   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6432   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6433   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6434   movq(rdx, rax);
6435   subq(rax, divisor);
6436   if (VM_Version::supports_bmi1()) {
6437     andnq(rax, rax, rdx);
6438   } else {
6439     notq(rax);
6440     andq(rax, rdx);
6441   }
6442   movq(tmp, rax);
6443   shrq(rax, 63); // quotient
6444   sarq(tmp, 63);
6445   andq(tmp, divisor);
6446   subq(rdx, tmp); // remainder
6447   bind(done);
6448 }
6449 #endif
6450 
6451 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6452                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6453                                         int vlen_enc) {
6454   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This ensures that indices which are
  // congruent modulo 16 map to the same relative position within a 128 bit
  // lane, e.g. elements selected by shuffle indices 16, 32 and 48 all sit at
  // relative position 0 of their respective 128 bit lanes.
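  // For illustration only: a shuffle index of 37 selects source byte 37, i.e.
  // relative position 37 & 0xF == 5 within the third 128 bit lane; the third
  // pass below (INDEX >= 32 && INDEX < 48) broadcasts that lane and vpshufb
  // picks position 5 from it.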
6461   movl(rtmp, 16);
6462   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6463 
  // Compute a mask for the shuffle vector by comparing indices against the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
  // original shuffle indices and move the shuffled lanes corresponding to the true
  // mask bits to the destination vector.
6468   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6469   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6470   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6471 
6472   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6473   // and broadcasting second 128 bit lane.
6474   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6475   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6476   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6477   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6478   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6479 
6480   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6481   // and broadcasting third 128 bit lane.
6482   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6483   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6484   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6485   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6486   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6487 
6488   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6490   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6491   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6492   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6493   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6494   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6495 }
6496 
6497 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6498                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6499   if (vlen_enc == AVX_128bit) {
6500     vpermilps(dst, src, shuffle, vlen_enc);
6501   } else if (bt == T_INT) {
6502     vpermd(dst, shuffle, src, vlen_enc);
6503   } else {
6504     assert(bt == T_FLOAT, "");
6505     vpermps(dst, shuffle, src, vlen_enc);
6506   }
6507 }