1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  54 
  55   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  56   // NativeJump::patch_verified_entry will be able to patch out the entry
  57   // code safely. The push to verify stack depth is ok at 5 bytes;
  58   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  59   // a stack bang then we must use the 6-byte frame allocation even if
  60   // we have no frame. :-(
  61   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  62 
  63   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  64   // Remove word for return addr
  65   framesize -= wordSize;
  66   stack_bang_size -= wordSize;
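       // Roughly, the frame built below is laid out as (stack grows down):
       //   [ return address ]   <- rsp on entry
       //   [ saved rbp      ]
       //   [ spills/locals  ]   <- rsp after the prolog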
  67 
  68   // Calls to C2R adapters often do not accept exceptional returns.
  69   // We require that their callers bang for them.  But be careful, because
  70   // some VM calls (such as call site linkage) can use several kilobytes of
  71   // stack; the stack safety zone should account for that.
  72   // See bugs 4446381, 4468289, 4497237.
  73   if (stack_bang_size > 0) {
  74     generate_stack_overflow_check(stack_bang_size);
  75 
  76     // We always push rbp so that, on return to the interpreter, rbp will be
  77     // restored correctly and we can correct the stack.
  78     push(rbp);
  79     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  80     if (PreserveFramePointer) {
  81       mov(rbp, rsp);
  82     }
  83     // Remove word for ebp
  84     framesize -= wordSize;
  85 
  86     // Create frame
  87     if (framesize) {
  88       subptr(rsp, framesize);
  89     }
  90   } else {
  91     // Create frame (force generation of a 4 byte immediate value)
  92     subptr_imm32(rsp, framesize);
  93 
  94     // Save RBP register now.
  95     framesize -= wordSize;
  96     movptr(Address(rsp, framesize), rbp);
  97     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  98     if (PreserveFramePointer) {
  99       movptr(rbp, rsp);
 100       if (framesize > 0) {
 101         addptr(rbp, framesize);
 102       }
 103     }
 104   }
 105 
 106   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 107     framesize -= wordSize;
 108     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 109   }
 110 
 111 #ifndef _LP64
 112   // If method sets FPU control word do it now
 113   if (fp_mode_24b) {
 114     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 115   }
 116   if (UseSSE >= 2 && VerifyFPU) {
 117     verify_FPU(0, "FPU stack must be clean on entry");
 118   }
 119 #endif
 120 
 121 #ifdef ASSERT
 122   if (VerifyStackAtCalls) {
 123     Label L;
 124     push(rax);
 125     mov(rax, rsp);
 126     andptr(rax, StackAlignmentInBytes-1);
 127     cmpptr(rax, StackAlignmentInBytes-wordSize);
 128     pop(rax);
 129     jcc(Assembler::equal, L);
 130     STOP("Stack is not properly aligned!");
 131     bind(L);
 132   }
 133 #endif
 134 
 135   if (!is_stub) {
 136     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 137 #ifdef _LP64
 138     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 139       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 140       Label dummy_slow_path;
 141       Label dummy_continuation;
 142       Label* slow_path = &dummy_slow_path;
 143       Label* continuation = &dummy_continuation;
 144       if (!Compile::current()->output()->in_scratch_emit_size()) {
 145         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 146         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 147         Compile::current()->output()->add_stub(stub);
 148         slow_path = &stub->entry();
 149         continuation = &stub->continuation();
 150       }
 151       bs->nmethod_entry_barrier(this, slow_path, continuation);
 152     }
 153 #else
 154     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 155     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 156 #endif
 157   }
 158 }
 159 
 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 161   switch (vlen_in_bytes) {
 162     case  4: // fall-through
 163     case  8: // fall-through
 164     case 16: return Assembler::AVX_128bit;
 165     case 32: return Assembler::AVX_256bit;
 166     case 64: return Assembler::AVX_512bit;
 167 
 168     default: {
 169       ShouldNotReachHere();
 170       return Assembler::AVX_NoVec;
 171     }
 172   }
 173 }
 174 
 175 #if INCLUDE_RTM_OPT
 176 
 177 // Update rtm_counters based on abort status
 178 // input: abort_status
 179 //        rtm_counters (RTMLockingCounters*)
 180 // flags are killed
 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 182 
 183   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 184   if (PrintPreciseRTMLockingStatistics) {
 185     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 186       Label check_abort;
 187       testl(abort_status, (1<<i));
 188       jccb(Assembler::equal, check_abort);
 189       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 190       bind(check_abort);
 191     }
 192   }
 193 }
 194 
 195 // Branch if ((random & (count-1)) != 0); count is a power of two (2^n)
 196 // tmp, scr and flags are killed
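     // The low bits of the TSC act as a cheap pseudo-random source: e.g. with
     // count == 64 the branch is taken whenever any of the low 6 bits are set,
     // i.e. roughly 63 times out of 64.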
 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 198   assert(tmp == rax, "");
 199   assert(scr == rdx, "");
 200   rdtsc(); // modifies EDX:EAX
 201   andptr(tmp, count-1);
 202   jccb(Assembler::notZero, brLabel);
 203 }
 204 
 205 // Perform abort ratio calculation, set no_rtm bit if high ratio
 206 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 207 // tmpReg, rtm_counters_Reg and flags are killed
 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 209                                                     Register rtm_counters_Reg,
 210                                                     RTMLockingCounters* rtm_counters,
 211                                                     Metadata* method_data) {
 212   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 213 
 214   if (RTMLockingCalculationDelay > 0) {
 215     // Delay calculation
 216     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 217     testptr(tmpReg, tmpReg);
 218     jccb(Assembler::equal, L_done);
 219   }
 220   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 221   //   Aborted transactions = abort_count * 100
 222   //   All transactions = total_count *  RTMTotalCountIncrRate
 223   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
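       //   Example: with RTMAbortRatio == 50 and RTMTotalCountIncrRate == 1 this sets
       //   no_rtm once abort_count * 100 >= total_count * 50, i.e. once at least half
       //   of the sampled transactions have aborted.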
 224 
 225   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 226   cmpptr(tmpReg, RTMAbortThreshold);
 227   jccb(Assembler::below, L_check_always_rtm2);
 228   imulptr(tmpReg, tmpReg, 100);
 229 
 230   Register scrReg = rtm_counters_Reg;
 231   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 232   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 233   imulptr(scrReg, scrReg, RTMAbortRatio);
 234   cmpptr(tmpReg, scrReg);
 235   jccb(Assembler::below, L_check_always_rtm1);
 236   if (method_data != nullptr) {
 237     // set rtm_state to "no rtm" in MDO
 238     mov_metadata(tmpReg, method_data);
 239     lock();
 240     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 241   }
 242   jmpb(L_done);
 243   bind(L_check_always_rtm1);
 244   // Reload RTMLockingCounters* address
 245   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 246   bind(L_check_always_rtm2);
 247   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 248   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 249   jccb(Assembler::below, L_done);
 250   if (method_data != nullptr) {
 251     // set rtm_state to "always rtm" in MDO
 252     mov_metadata(tmpReg, method_data);
 253     lock();
 254     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 255   }
 256   bind(L_done);
 257 }
 258 
 259 // Update counters and perform abort ratio calculation
 260 // input:  abort_status_Reg
 261 // rtm_counters_Reg, flags are killed
 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 263                                       Register rtm_counters_Reg,
 264                                       RTMLockingCounters* rtm_counters,
 265                                       Metadata* method_data,
 266                                       bool profile_rtm) {
 267 
 268   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 269   // update rtm counters based on rax value at abort
 270   // reads abort_status_Reg, updates flags
 271   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 272   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 273   if (profile_rtm) {
 274     // Save abort status because abort_status_Reg is used by following code.
 275     if (RTMRetryCount > 0) {
 276       push(abort_status_Reg);
 277     }
 278     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 279     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 280     // restore abort status
 281     if (RTMRetryCount > 0) {
 282       pop(abort_status_Reg);
 283     }
 284   }
 285 }
 286 
 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 288 // inputs: retry_count_Reg
 289 //       : abort_status_Reg
 290 // output: retry_count_Reg decremented by 1
 291 // flags are killed
 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 293   Label doneRetry;
 294   assert(abort_status_Reg == rax, "");
 295   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 296   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 297   // if reason is in 0x6 and retry count != 0 then retry
 298   andptr(abort_status_Reg, 0x6);
 299   jccb(Assembler::zero, doneRetry);
 300   testl(retry_count_Reg, retry_count_Reg);
 301   jccb(Assembler::zero, doneRetry);
 302   pause();
 303   decrementl(retry_count_Reg);
 304   jmp(retryLabel);
 305   bind(doneRetry);
 306 }
 307 
 308 // Spin and retry if the lock is busy.
 309 // inputs: box_Reg (monitor address)
 310 //       : retry_count_Reg
 311 // output: retry_count_Reg decremented by 1
 312 //       : clear z flag if retry count exceeded
 313 // tmp_Reg, scr_Reg, flags are killed
 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 315                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 316   Label SpinLoop, SpinExit, doneRetry;
 317   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 318 
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   decrementl(retry_count_Reg);
 322   movptr(scr_Reg, RTMSpinLoopCount);
 323 
 324   bind(SpinLoop);
 325   pause();
 326   decrementl(scr_Reg);
 327   jccb(Assembler::lessEqual, SpinExit);
 328   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 329   testptr(tmp_Reg, tmp_Reg);
 330   jccb(Assembler::notZero, SpinLoop);
 331 
 332   bind(SpinExit);
 333   jmp(retryLabel);
 334   bind(doneRetry);
 335   incrementl(retry_count_Reg); // clear z flag
 336 }
 337 
 338 // Use RTM for normal stack locks
 339 // Input: objReg (object to lock)
 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 341                                          Register retry_on_abort_count_Reg,
 342                                          RTMLockingCounters* stack_rtm_counters,
 343                                          Metadata* method_data, bool profile_rtm,
 344                                          Label& DONE_LABEL, Label& IsInflated) {
 345   assert(UseRTMForStackLocks, "why call this otherwise?");
 346   assert(tmpReg == rax, "");
 347   assert(scrReg == rdx, "");
 348   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 349 
 350   if (RTMRetryCount > 0) {
 351     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 352     bind(L_rtm_retry);
 353   }
 354   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 355   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 356   jcc(Assembler::notZero, IsInflated);
 357 
 358   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 359     Label L_noincrement;
 360     if (RTMTotalCountIncrRate > 1) {
 361       // tmpReg, scrReg and flags are killed
 362       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 363     }
 364     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 365     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 366     bind(L_noincrement);
 367   }
 368   xbegin(L_on_abort);
 369   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 370   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 371   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 372   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
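       // Note that the transaction leaves the markword untouched: the lock is elided.
       // If another thread really locks the object, its write to the markword aborts
       // this transaction and control resumes at L_on_abort.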
 373 
 374   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 375   if (UseRTMXendForLockBusy) {
 376     xend();
 377     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 378     jmp(L_decrement_retry);
 379   }
 380   else {
 381     xabort(0);
 382   }
 383   bind(L_on_abort);
 384   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 385     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 386   }
 387   bind(L_decrement_retry);
 388   if (RTMRetryCount > 0) {
 389     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 390     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 391   }
 392 }
 393 
 394 // Use RTM for inflating locks
 395 // inputs: objReg (object to lock)
 396 //         boxReg (on-stack box address (displaced header location) - KILLED)
 397 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 399                                             Register scrReg, Register retry_on_busy_count_Reg,
 400                                             Register retry_on_abort_count_Reg,
 401                                             RTMLockingCounters* rtm_counters,
 402                                             Metadata* method_data, bool profile_rtm,
 403                                             Label& DONE_LABEL) {
 404   assert(UseRTMLocking, "why call this otherwise?");
 405   assert(tmpReg == rax, "");
 406   assert(scrReg == rdx, "");
 407   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 408   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 409 
 410   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 411   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 412 
 413   if (RTMRetryCount > 0) {
 414     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 415     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 416     bind(L_rtm_retry);
 417   }
 418   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 419     Label L_noincrement;
 420     if (RTMTotalCountIncrRate > 1) {
 421       // tmpReg, scrReg and flags are killed
 422       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 423     }
 424     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 425     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 426     bind(L_noincrement);
 427   }
 428   xbegin(L_on_abort);
 429   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 430   movptr(tmpReg, Address(tmpReg, owner_offset));
 431   testptr(tmpReg, tmpReg);
 432   jcc(Assembler::zero, DONE_LABEL);
 433   if (UseRTMXendForLockBusy) {
 434     xend();
 435     jmp(L_decrement_retry);
 436   }
 437   else {
 438     xabort(0);
 439   }
 440   bind(L_on_abort);
 441   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 442   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 443     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 444   }
 445   if (RTMRetryCount > 0) {
 446     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 447     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 448   }
 449 
 450   movptr(tmpReg, Address(boxReg, owner_offset));
 451   testptr(tmpReg, tmpReg);
 452   jccb(Assembler::notZero, L_decrement_retry);
 453 
 454   // Appears unlocked - try to swing _owner from null to non-null.
 455   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 456 #ifdef _LP64
 457   Register threadReg = r15_thread;
 458 #else
 459   get_thread(scrReg);
 460   Register threadReg = scrReg;
 461 #endif
 462   lock();
 463   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 464 
 465   if (RTMRetryCount > 0) {
 466     // If the CAS succeeded we are done; otherwise retry.
 467     jccb(Assembler::equal, DONE_LABEL);
 468     bind(L_decrement_retry);
 469     // Spin and retry if lock is busy.
 470     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 471   }
 472   else {
 473     bind(L_decrement_retry);
 474   }
 475 }
 476 
 477 #endif //  INCLUDE_RTM_OPT
 478 
 479 // fast_lock and fast_unlock used by C2
 480 
 481 // Because the transitions from emitted code to the runtime
 482 // monitorenter/exit helper stubs are so slow it's critical that
 483 // we inline both the stack-locking fast path and the inflated fast path.
 484 //
 485 // See also: cmpFastLock and cmpFastUnlock.
 486 //
 487 // What follows is a specialized inline transliteration of the code
 488 // in enter() and exit(). If we're concerned about I$ bloat another
 489 // option would be to emit TrySlowEnter and TrySlowExit methods
 490 // at startup-time.  These methods would accept arguments as
 491 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 492 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 493 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 494 // In practice, however, the # of lock sites is bounded and is usually small.
 495 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 496 // if the processor uses simple bimodal branch predictors keyed by EIP,
 497 // since the helper routines would be called from multiple synchronization
 498 // sites.
 499 //
 500 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 501 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 502 // to those specialized methods.  That'd give us a mostly platform-independent
 503 // implementation that the JITs could optimize and inline at their pleasure.
 504 // Done correctly, the only time we'd need to cross to native code would be
 505 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 506 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 507 // (b) explicit barriers or fence operations.
 508 //
 509 // TODO:
 510 //
 511 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 512 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 513 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 514 //    the lock operators would typically be faster than reifying Self.
 515 //
 516 // *  Ideally I'd define the primitives as:
 517 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 518 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 519 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 520 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 521 //    Furthermore the register assignments are overconstrained, possibly resulting in
 522 //    sub-optimal code near the synchronization site.
 523 //
 524 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 525 //    Alternately, use a better sp-proximity test.
 526 //
 527 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 528 //    Either one is sufficient to uniquely identify a thread.
 529 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 530 //
 531 // *  Intrinsify notify() and notifyAll() for the common cases where the
 532 //    object is locked by the calling thread but the waitlist is empty,
 533 //    avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 534 //
 535 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 536 //    But beware of excessive branch density on AMD Opterons.
 537 //
 538 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 539 //    or failure of the fast path.  If the fast path fails then we pass
 540 //    control to the slow path, typically in C.  In fast_lock and
 541 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 542 //    will emit a conditional branch immediately after the node.
 543 //    So we have branches to branches and lots of ICC.ZF games.
 544 //    Instead, it might be better to have C2 pass a "FailureLabel"
 545 //    into fast_lock and fast_unlock.  In the case of success, control
 546 //    will drop through the node.  ICC.ZF is undefined at exit.
 547 //    In the case of failure, the node will branch directly to the
 548 //    FailureLabel.
 549 
 550 
 551 // obj: object to lock
 552 // box: on-stack box address (displaced header location) - KILLED
 553 // rax: tmp -- KILLED
 554 // scr: tmp -- KILLED
 555 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 556                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 557                                  RTMLockingCounters* rtm_counters,
 558                                  RTMLockingCounters* stack_rtm_counters,
 559                                  Metadata* method_data,
 560                                  bool use_rtm, bool profile_rtm) {
 561   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 562   // Ensure the register assignments are disjoint
 563   assert(tmpReg == rax, "");
 564 
 565   if (use_rtm) {
 566     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 567   } else {
 568     assert(cx1Reg == noreg, "");
 569     assert(cx2Reg == noreg, "");
 570     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 571   }
 572 
 573   // Possible cases that we'll encounter in fast_lock
 574   // ------------------------------------------------
 575   // * Inflated
 576   //    -- unlocked
 577   //    -- Locked
 578   //       = by self
 579   //       = by other
 580   // * neutral
 581   // * stack-locked
 582   //    -- by self
 583   //       = sp-proximity test hits
 584   //       = sp-proximity test generates false-negative
 585   //    -- by other
 586   //
 587 
 588   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 589 
 590   if (DiagnoseSyncOnValueBasedClasses != 0) {
 591     load_klass(tmpReg, objReg, scrReg);
 592     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 593     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 594     jcc(Assembler::notZero, DONE_LABEL);
 595   }
 596 
 597 #if INCLUDE_RTM_OPT
 598   if (UseRTMForStackLocks && use_rtm) {
 599     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 600     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 601                       stack_rtm_counters, method_data, profile_rtm,
 602                       DONE_LABEL, IsInflated);
 603   }
 604 #endif // INCLUDE_RTM_OPT
 605 
 606   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 607   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 608   jcc(Assembler::notZero, IsInflated);
 609 
 610   if (LockingMode == LM_MONITOR) {
 611     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 612     testptr(objReg, objReg);
 613   } else {
 614     assert(LockingMode == LM_LEGACY, "must be");
 615     // Attempt stack-locking ...
 616     orptr (tmpReg, markWord::unlocked_value);
 617     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 618     lock();
 619     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 620     jcc(Assembler::equal, COUNT);           // Success
 621 
 622     // Recursive locking.
 623     // The object is stack-locked: markword contains stack pointer to BasicLock.
 624     // Locked by current thread if difference with current SP is less than one page.
 625     subptr(tmpReg, rsp);
 626     // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
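         // The 64-bit mask is 7 - page_size (e.g. 0x...fffff007 for 4K pages): it keeps
         // the low tag bits and every bit at or above the page size, so the AND leaves
         // zero (ZF == 1) only when the stack address in the markword lies within one
         // page above rsp and the low bits are clear.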
 627     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 628     movptr(Address(boxReg, 0), tmpReg);
 629   }
 630   jmp(DONE_LABEL);
 631 
 632   bind(IsInflated);
 633   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 634 
 635 #if INCLUDE_RTM_OPT
 636   // Use the same RTM locking code in 32- and 64-bit VM.
 637   if (use_rtm) {
 638     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 639                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 640   } else {
 641 #endif // INCLUDE_RTM_OPT
 642 
 643 #ifndef _LP64
 644   // The object is inflated.
 645 
 646   // boxReg refers to the on-stack BasicLock in the current frame.
 647   // We'd like to write:
 648   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 649   // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 650   // additional latency as we have another ST in the store buffer that must drain.
 651 
 652   // avoid ST-before-CAS
 653   // register juggle because we need tmpReg for cmpxchgptr below
 654   movptr(scrReg, boxReg);
 655   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 656 
 657   // Optimistic form: consider XORL tmpReg,tmpReg
 658   movptr(tmpReg, NULL_WORD);
 659 
 660   // Appears unlocked - try to swing _owner from null to non-null.
 661   // Ideally, I'd manifest "Self" with get_thread and then attempt
 662   // to CAS the register containing Self into m->Owner.
 663   // But we don't have enough registers, so instead we can either try to CAS
 664   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 665   // we later store "Self" into m->Owner.  Transiently storing a stack address
 666   // (rsp or the address of the box) into m->owner is harmless.
 667   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 668   lock();
 669   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 670   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 671   // If we weren't able to swing _owner from null to the BasicLock
 672   // then take the slow path.
 673   jccb  (Assembler::notZero, NO_COUNT);
 674   // update _owner from BasicLock to thread
 675   get_thread (scrReg);                    // beware: clobbers ICCs
 676   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 677   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 678 
 679   // If the CAS fails we can either retry or pass control to the slow path.
 680   // We use the latter tactic.
 681   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 682   // If the CAS was successful ...
 683   //   Self has acquired the lock
 684   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 685   // Intentional fall-through into DONE_LABEL ...
 686 #else // _LP64
 687   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 688   movq(scrReg, tmpReg);
 689   xorq(tmpReg, tmpReg);
 690   lock();
 691   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 692   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 693   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 694   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 695   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 696   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 697 
 698   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 699   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 700   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 701   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 702 #endif // _LP64
 703 #if INCLUDE_RTM_OPT
 704   } // use_rtm()
 705 #endif
 706   bind(DONE_LABEL);
 707 
 708   // ZFlag == 1 count in fast path
 709   // ZFlag == 0 count in slow path
 710   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 711 
 712   bind(COUNT);
 713   // Count monitors in fast path
 714   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 715 
 716   xorl(tmpReg, tmpReg); // Set ZF == 1
 717 
 718   bind(NO_COUNT);
 719 
 720   // At NO_COUNT the icc ZFlag is set as follows ...
 721   // fast_unlock uses the same protocol.
 722   // ZFlag == 1 -> Success
 723   // ZFlag == 0 -> Failure - force control through the slow path
 724 }
 725 
 726 // obj: object to unlock
 727 // box: box address (displaced header location), killed.  Must be EAX.
 728 // tmp: killed, cannot be obj nor box.
 729 //
 730 // Some commentary on balanced locking:
 731 //
 732 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 733 // Methods that don't have provably balanced locking are forced to run in the
 734 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 735 // The interpreter provides two properties:
 736 // I1:  At return-time the interpreter automatically and quietly unlocks any
 737 //      objects acquired by the current activation (frame).  Recall that the
 738 //      interpreter maintains an on-stack list of locks currently held by
 739 //      a frame.
 740 // I2:  If a method attempts to unlock an object that is not held by the
 741 //      frame, the interpreter throws IMSX (IllegalMonitorStateException).
 742 //
 743 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 744 // B() doesn't have provably balanced locking so it runs in the interpreter.
 745 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 746 // is still locked by A().
 747 //
 748 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 749 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 750 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 751 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 752 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 753 // could reasonably *avoid* checking the owner in fast_unlock().
 754 // In the interest of performance we elide m->Owner==Self check in unlock.
 755 // A perfectly viable alternative is to elide the owner check except when
 756 // Xcheck:jni is enabled.
 757 
 758 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 759   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 760   assert(boxReg == rax, "");
 761   assert_different_registers(objReg, boxReg, tmpReg);
 762 
 763   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 764 
 765 #if INCLUDE_RTM_OPT
 766   if (UseRTMForStackLocks && use_rtm) {
 767     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 768     Label L_regular_unlock;
 769     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 770     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 771     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 772     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 773     xend();                                                           // otherwise end...
 774     jmp(DONE_LABEL);                                                  // ... and we're done
 775     bind(L_regular_unlock);
 776   }
 777 #endif
 778 
 779   if (LockingMode == LM_LEGACY) {
 780     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 781     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 782   }
 783   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 784   if (LockingMode != LM_MONITOR) {
 785     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 786     jcc(Assembler::zero, Stacked);
 787   }
 788 
 789   // It's inflated.
 790 
 791 #if INCLUDE_RTM_OPT
 792   if (use_rtm) {
 793     Label L_regular_inflated_unlock;
 794     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 795     movptr(boxReg, Address(tmpReg, owner_offset));
 796     testptr(boxReg, boxReg);
 797     jccb(Assembler::notZero, L_regular_inflated_unlock);
 798     xend();
 799     jmp(DONE_LABEL);
 800     bind(L_regular_inflated_unlock);
 801   }
 802 #endif
 803 
 804   // Despite our balanced locking property we still check that m->_owner == Self
 805   // as Java routines or native JNI code called by this thread might
 806   // have released the lock.
 807   // Refer to the comments in synchronizer.cpp for how we might encode extra
 808   // state in _succ so we can avoid fetching EntryList|cxq.
 809   //
 810   // If there's no contention try a 1-0 exit.  That is, exit without
 811   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 812   // we detect and recover from the race that the 1-0 exit admits.
 813   //
 814   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 815   // before it STs null into _owner, releasing the lock.  Updates
 816   // to data protected by the critical section must be visible before
 817   // we drop the lock (and thus before any other thread could acquire
 818   // the lock and observe the fields protected by the lock).
 819   // IA32's memory-model is SPO, so STs are ordered with respect to
 820   // each other and there's no need for an explicit barrier (fence).
 821   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 822 #ifndef _LP64
 823   // Note that we could employ various encoding schemes to reduce
 824   // the number of loads below (currently 4) to just 2 or 3.
 825   // Refer to the comments in synchronizer.cpp.
 826   // In practice the chain of fetches doesn't seem to impact performance, however.
 827   xorptr(boxReg, boxReg);
 828   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 829   jccb  (Assembler::notZero, DONE_LABEL);
 830   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 831   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 832   jccb  (Assembler::notZero, DONE_LABEL);
 833   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 834   jmpb  (DONE_LABEL);
 835 #else // _LP64
 836   // It's inflated
 837   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 838 
 839   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 840   jccb(Assembler::equal, LNotRecursive);
 841 
 842   // Recursive inflated unlock
 843   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 844   jmpb(LSuccess);
 845 
 846   bind(LNotRecursive);
 847   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 848   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 849   jccb  (Assembler::notZero, CheckSucc);
 850   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 851   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 852   jmpb  (DONE_LABEL);
 853 
 854   // Try to avoid passing control into the slow_path ...
 855   bind  (CheckSucc);
 856 
 857   // The following optional optimization can be elided if necessary
 858   // Effectively: if (succ == null) goto slow path
 859   // The code reduces the window for a race, however,
 860   // and thus benefits performance.
 861   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 862   jccb  (Assembler::zero, LGoSlowPath);
 863 
 864   xorptr(boxReg, boxReg);
 865   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 866   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 867 
 868   // Memory barrier/fence
 869   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 870   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 871   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 872   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 873   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 874   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 875   lock(); addl(Address(rsp, 0), 0);
 876 
 877   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 878   jccb  (Assembler::notZero, LSuccess);
 879 
 880   // Rare inopportune interleaving - race.
 881   // The successor vanished in the small window above.
 882   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 883   // We need to ensure progress and succession.
 884   // Try to reacquire the lock.
 885   // If that fails then the new owner is responsible for succession and this
 886   // thread needs to take no further action and can exit via the fast path (success).
 887   // If the re-acquire succeeds then pass control into the slow path.
 888   // As implemented, this latter mode is horrible because we generate more
 889   // coherence traffic on the lock *and* artificially extend the critical section
 890   // length by virtue of passing control into the slow path.
 891 
 892   // box is really RAX -- the following CMPXCHG depends on that binding
 893   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 894   lock();
 895   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 896   // There's no successor, so we try to regrab the lock.
 897   // If that doesn't work, then another thread has grabbed the
 898   // lock and we're done (the exit was a success).
 899   jccb  (Assembler::notEqual, LSuccess);
 900   // Intentional fall-through into slow path
 901 
 902   bind  (LGoSlowPath);
 903   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 904   jmpb  (DONE_LABEL);
 905 
 906   bind  (LSuccess);
 907   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 908   jmpb  (DONE_LABEL);
 909 
 910 #endif
 911   if (LockingMode == LM_LEGACY) {
 912     bind  (Stacked);
 913     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 914     lock();
 915     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 916     // Intentional fall-thru into DONE_LABEL
 917   }
 918 
 919   bind(DONE_LABEL);
 920 
 921   // ZFlag == 1 count in fast path
 922   // ZFlag == 0 count in slow path
 923   jccb(Assembler::notZero, NO_COUNT);
 924 
 925   bind(COUNT);
 926   // Count monitors in fast path
 927 #ifndef _LP64
 928   get_thread(tmpReg);
 929   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 930 #else // _LP64
 931   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 932 #endif
 933 
 934   xorl(tmpReg, tmpReg); // Set ZF == 1
 935 
 936   bind(NO_COUNT);
 937 }
 938 
 939 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 940                                               Register t, Register thread) {
 941   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 942   assert(rax_reg == rax, "Used for CAS");
 943   assert_different_registers(obj, box, rax_reg, t, thread);
 944 
 945   // Handle inflated monitor.
 946   Label inflated;
 947   // Finish fast lock successfully. ZF value is irrelevant.
 948   Label locked;
 949   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 950   Label slow_path;
 951 
 952   if (DiagnoseSyncOnValueBasedClasses != 0) {
 953     load_klass(rax_reg, obj, t);
 954     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 955     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 956     jcc(Assembler::notZero, slow_path);
 957   }
 958 
 959   const Register mark = t;
 960 
 961   { // Lightweight Lock
 962 
 963     Label push;
 964 
 965     const Register top = box;
 966 
 967     // Load the mark.
 968     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 969 
 970     // Prefetch top.
 971     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
 972 
 973     // Check for monitor (0b10).
 974     testptr(mark, markWord::monitor_value);
 975     jcc(Assembler::notZero, inflated);
 976 
 977     // Check if lock-stack is full.
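         // top holds the byte offset (relative to the thread) of the next free
         // lock-stack slot; top >= end_offset() means the lock stack is full.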
 978     cmpl(top, LockStack::end_offset() - 1);
 979     jcc(Assembler::greater, slow_path);
 980 
 981     // Check if recursive.
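         // Recursion is recognized only when obj is already the top entry (the slot
         // at offset top - oopSize); a deeper re-lock of an already-held object falls
         // through to the CAS, which fails and sends us to the slow path.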
 982     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
 983     jccb(Assembler::equal, push);
 984 
 985     // Try to lock. Transition lock bits 0b01 => 0b00
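         // rax_reg := mark with the unlocked bit set (expected value for the CAS),
         // mark    := mark with the unlocked bit cleared (new, locked value).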
 986     movptr(rax_reg, mark);
 987     orptr(rax_reg, markWord::unlocked_value);
 988     andptr(mark, ~(int32_t)markWord::unlocked_value);
 989     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 990     jcc(Assembler::notEqual, slow_path);
 991 
 992     bind(push);
 993     // After successful lock, push object on lock-stack.
 994     movptr(Address(thread, top), obj);
 995     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
 996     jmpb(locked);
 997   }
 998 
 999   { // Handle inflated monitor.
1000     bind(inflated);
1001 
1002     const Register tagged_monitor = mark;
1003 
1004     // CAS owner (null => current thread).
1005     xorptr(rax_reg, rax_reg);
1006     lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1007     jccb(Assembler::equal, locked);
1008 
1009     // Check if recursive.
1010     cmpptr(thread, rax_reg);
1011     jccb(Assembler::notEqual, slow_path);
1012 
1013     // Recursive.
1014     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1015   }
1016 
1017   bind(locked);
1018   increment(Address(thread, JavaThread::held_monitor_count_offset()));
1019   // Set ZF = 1
1020   xorl(rax_reg, rax_reg);
1021 
1022 #ifdef ASSERT
1023   // Check that locked label is reached with ZF set.
1024   Label zf_correct;
1025   Label zf_bad_zero;
1026   jcc(Assembler::zero, zf_correct);
1027   jmp(zf_bad_zero);
1028 #endif
1029 
1030   bind(slow_path);
1031 #ifdef ASSERT
1032   // Check that slow_path label is reached with ZF not set.
1033   jcc(Assembler::notZero, zf_correct);
1034   stop("Fast Lock ZF != 0");
1035   bind(zf_bad_zero);
1036   stop("Fast Lock ZF != 1");
1037   bind(zf_correct);
1038 #endif
1039   // C2 uses the value of ZF to determine the continuation.
1040 }
1041 
1042 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
1043   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1044   assert(reg_rax == rax, "Used for CAS");
1045   assert_different_registers(obj, reg_rax, t);
1046 
1047   // Handle inflated monitor.
1048   Label inflated, inflated_check_lock_stack;
1049   // Finish fast unlock successfully.  MUST jump with ZF == 1
1050   Label unlocked;
1051 
1052   // Assume success.
1053   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
1054 
1055   const Register mark = t;
1056   const Register top = reg_rax;
1057 
1058   Label dummy;
1059   C2FastUnlockLightweightStub* stub = nullptr;
1060 
1061   if (!Compile::current()->output()->in_scratch_emit_size()) {
1062     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
1063     Compile::current()->output()->add_stub(stub);
1064   }
1065 
1066   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1067   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
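       // When merely measuring code size in a scratch buffer no stub is created,
       // so both labels above alias the local 'dummy' label.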
1068 
1069   { // Lightweight Unlock
1070 
1071     // Load top.
1072     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1073 
1074     // Prefetch mark.
1075     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1076 
1077     // Check if obj is top of lock-stack.
1078     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1079     // Top of lock stack was not obj. Must be monitor.
1080     jcc(Assembler::notEqual, inflated_check_lock_stack);
1081 
1082     // Pop lock-stack.
1083     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1084     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1085 
1086     // Check if recursive.
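         // If the entry below the one just popped is also obj, this was a recursive
         // lightweight lock and the pop alone completes the unlock.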
1087     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1088     jcc(Assembler::equal, unlocked);
1089 
1090     // We elide the monitor check and let the CAS fail instead.
1091 
1092     // Try to unlock. Transition lock bits 0b00 => 0b01
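         // reg_rax := mark with the lock bits cleared (expected, locked value),
         // mark    := mark with the unlocked bit set (new, unlocked value).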
1093     movptr(reg_rax, mark);
1094     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1095     orptr(mark, markWord::unlocked_value);
1096     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1097     jcc(Assembler::notEqual, push_and_slow_path);
1098     jmp(unlocked);
1099   }
1100 
1101 
1102   { // Handle inflated monitor.
1103     bind(inflated_check_lock_stack);
1104 #ifdef ASSERT
1105     Label check_done;
1106     subl(top, oopSize);
1107     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1108     jcc(Assembler::below, check_done);
1109     cmpptr(obj, Address(thread, top));
1110     jccb(Assembler::notEqual, inflated_check_lock_stack);
1111     stop("Fast Unlock lock on stack");
1112     bind(check_done);
1113     testptr(mark, markWord::monitor_value);
1114     jccb(Assembler::notZero, inflated);
1115     stop("Fast Unlock not monitor");
1116 #endif
1117 
1118     bind(inflated);
1119 
1120     // mark contains the tagged ObjectMonitor*.
1121     const Register monitor = mark;
1122 
1123 #ifndef _LP64
1124     // Check if recursive.
1125     xorptr(reg_rax, reg_rax);
1126     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1127     jcc(Assembler::notZero, check_successor);
1128 
1129     // Check if the entry lists are empty.
1130     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1131     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1132     jcc(Assembler::notZero, check_successor);
1133 
1134     // Release lock.
1135     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1136 #else // _LP64
1137     Label recursive;
1138 
1139     // Check if recursive.
1140     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1141     jccb(Assembler::notEqual, recursive);
1142 
1143     // Check if the entry lists are empty.
1144     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1145     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1146     jcc(Assembler::notZero, check_successor);
1147 
1148     // Release lock.
1149     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1150     jmpb(unlocked);
1151 
1152     // Recursive unlock.
1153     bind(recursive);
1154     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1155     xorl(t, t);
1156 #endif
1157   }
1158 
1159   bind(unlocked);
1160   if (stub != nullptr) {
1161     bind(stub->unlocked_continuation());
1162   }
1163 
1164 #ifdef ASSERT
1165   // Check that unlocked label is reached with ZF set.
1166   Label zf_correct;
1167   jcc(Assembler::zero, zf_correct);
1168   stop("Fast Unlock ZF != 1");
1169 #endif
1170 
1171   if (stub != nullptr) {
1172     bind(stub->slow_path_continuation());
1173   }
1174 #ifdef ASSERT
1175   // Check that the slow_path_continuation() label is reached with ZF not set.
1176   jccb(Assembler::notZero, zf_correct);
1177   stop("Fast Unlock ZF != 0");
1178   bind(zf_correct);
1179 #endif
1180   // C2 uses the value of ZF to determine the continuation.
1181 }
1182 
1183 //-------------------------------------------------------------------------------------------
1184 // Generic instructions support for use in .ad files C2 code generation
1185 
1186 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1187   if (dst != src) {
1188     movdqu(dst, src);
1189   }
1190   if (opcode == Op_AbsVD) {
1191     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1192   } else {
1193     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1194     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1199   if (opcode == Op_AbsVD) {
1200     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1201   } else {
1202     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1203     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1204   }
1205 }
1206 
1207 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1208   if (dst != src) {
1209     movdqu(dst, src);
1210   }
1211   if (opcode == Op_AbsVF) {
1212     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1213   } else {
1214     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1215     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1216   }
1217 }
1218 
1219 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1220   if (opcode == Op_AbsVF) {
1221     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1222   } else {
1223     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1224     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1225   }
1226 }
1227 
1228 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1229   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1230   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1231 
1232   if (opcode == Op_MinV) {
1233     if (elem_bt == T_BYTE) {
1234       pminsb(dst, src);
1235     } else if (elem_bt == T_SHORT) {
1236       pminsw(dst, src);
1237     } else if (elem_bt == T_INT) {
1238       pminsd(dst, src);
1239     } else {
1240       assert(elem_bt == T_LONG, "required");
1241       assert(tmp == xmm0, "required");
1242       assert_different_registers(dst, src, tmp);
1243       movdqu(xmm0, dst);
1244       pcmpgtq(xmm0, src);
1245       blendvpd(dst, src);  // xmm0 as mask
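           // blendvpd takes src wherever the mask in xmm0 (dst > src) is set,
           // leaving the elementwise minimum in dst.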
1246     }
1247   } else { // opcode == Op_MaxV
1248     if (elem_bt == T_BYTE) {
1249       pmaxsb(dst, src);
1250     } else if (elem_bt == T_SHORT) {
1251       pmaxsw(dst, src);
1252     } else if (elem_bt == T_INT) {
1253       pmaxsd(dst, src);
1254     } else {
1255       assert(elem_bt == T_LONG, "required");
1256       assert(tmp == xmm0, "required");
1257       assert_different_registers(dst, src, tmp);
1258       movdqu(xmm0, src);
1259       pcmpgtq(xmm0, dst);
1260       blendvpd(dst, src);  // xmm0 as mask
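           // Here xmm0 holds (src > dst), so blendvpd takes src exactly where it is
           // larger, leaving the elementwise maximum in dst.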
1261     }
1262   }
1263 }
1264 
1265 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1266                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1267                                  int vlen_enc) {
1268   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1269 
1270   if (opcode == Op_MinV) {
1271     if (elem_bt == T_BYTE) {
1272       vpminsb(dst, src1, src2, vlen_enc);
1273     } else if (elem_bt == T_SHORT) {
1274       vpminsw(dst, src1, src2, vlen_enc);
1275     } else if (elem_bt == T_INT) {
1276       vpminsd(dst, src1, src2, vlen_enc);
1277     } else {
1278       assert(elem_bt == T_LONG, "required");
1279       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1280         vpminsq(dst, src1, src2, vlen_enc);
1281       } else {
1282         assert_different_registers(dst, src1, src2);
1283         vpcmpgtq(dst, src1, src2, vlen_enc);
1284         vblendvpd(dst, src1, src2, dst, vlen_enc);
1285       }
1286     }
1287   } else { // opcode == Op_MaxV
1288     if (elem_bt == T_BYTE) {
1289       vpmaxsb(dst, src1, src2, vlen_enc);
1290     } else if (elem_bt == T_SHORT) {
1291       vpmaxsw(dst, src1, src2, vlen_enc);
1292     } else if (elem_bt == T_INT) {
1293       vpmaxsd(dst, src1, src2, vlen_enc);
1294     } else {
1295       assert(elem_bt == T_LONG, "required");
1296       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1297         vpmaxsq(dst, src1, src2, vlen_enc);
1298       } else {
1299         assert_different_registers(dst, src1, src2);
1300         vpcmpgtq(dst, src1, src2, vlen_enc);
1301         vblendvpd(dst, src2, src1, dst, vlen_enc);
1302       }
1303     }
1304   }
1305 }
1306 
1307 // Float/Double min max
1308 
1309 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1310                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1311                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1312                                    int vlen_enc) {
1313   assert(UseAVX > 0, "required");
1314   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1315          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1316   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1317   assert_different_registers(a, tmp, atmp, btmp);
1318   assert_different_registers(b, tmp, atmp, btmp);
1319 
1320   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1321   bool is_double_word = is_double_word_type(elem_bt);
1322 
1323   /* Note on 'non-obvious' assembly sequence:
1324    *
1325    * While there are vminps/vmaxps instructions, there are two important differences between hardware
1326    * and Java on how they handle floats:
1327    *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
1329    *
1330    * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
1331    *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
1332    *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
1339    *   Res  = (atmp == NaN) ? atmp : Tmp
1340    */
1341 
1342   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1343   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1344   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1345   XMMRegister mask;
1346 
1347   if (!is_double_word && is_min) {
1348     mask = a;
1349     vblend = &MacroAssembler::vblendvps;
1350     vmaxmin = &MacroAssembler::vminps;
1351     vcmp = &MacroAssembler::vcmpps;
1352   } else if (!is_double_word && !is_min) {
1353     mask = b;
1354     vblend = &MacroAssembler::vblendvps;
1355     vmaxmin = &MacroAssembler::vmaxps;
1356     vcmp = &MacroAssembler::vcmpps;
1357   } else if (is_double_word && is_min) {
1358     mask = a;
1359     vblend = &MacroAssembler::vblendvpd;
1360     vmaxmin = &MacroAssembler::vminpd;
1361     vcmp = &MacroAssembler::vcmppd;
1362   } else {
1363     assert(is_double_word && !is_min, "sanity");
1364     mask = b;
1365     vblend = &MacroAssembler::vblendvpd;
1366     vmaxmin = &MacroAssembler::vmaxpd;
1367     vcmp = &MacroAssembler::vcmppd;
1368   }
1369 
  // Pick maxmin/scratch so that register overlaps (e.g. dst == btmp) do not effectively
  // disable the EnableX86ECoreOpts path
1371   XMMRegister maxmin, scratch;
1372   if (dst == btmp) {
1373     maxmin = btmp;
1374     scratch = tmp;
1375   } else {
1376     maxmin = tmp;
1377     scratch = btmp;
1378   }
1379 
1380   bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
1381   if (precompute_mask && !is_double_word) {
1382     vpsrad(tmp, mask, 32, vlen_enc);
1383     mask = tmp;
1384   } else if (precompute_mask && is_double_word) {
1385     vpxor(tmp, tmp, tmp, vlen_enc);
1386     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1387     mask = tmp;
1388   }
1389 
1390   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1391   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1392   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1393   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1394   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1395 }
1396 
1397 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1398                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1399                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1400                                     int vlen_enc) {
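  // AVX-512 variant of vminmax_fp: the sign bit of a (for min) or b (for max) is turned
  // into an opmask with evpmov[dq]2m to order the operands, and a final UNORD compare
  // merges any NaN back in from atmp.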
1401   assert(UseAVX > 2, "required");
1402   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1403          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1404   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1405   assert_different_registers(dst, a, atmp, btmp);
1406   assert_different_registers(dst, b, atmp, btmp);
1407 
1408   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1409   bool is_double_word = is_double_word_type(elem_bt);
1410   bool merge = true;
1411 
1412   if (!is_double_word && is_min) {
1413     evpmovd2m(ktmp, a, vlen_enc);
1414     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1415     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1416     vminps(dst, atmp, btmp, vlen_enc);
1417     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1418     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1419   } else if (!is_double_word && !is_min) {
1420     evpmovd2m(ktmp, b, vlen_enc);
1421     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1422     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1423     vmaxps(dst, atmp, btmp, vlen_enc);
1424     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1425     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1426   } else if (is_double_word && is_min) {
1427     evpmovq2m(ktmp, a, vlen_enc);
1428     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1429     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1430     vminpd(dst, atmp, btmp, vlen_enc);
1431     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1432     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1433   } else {
1434     assert(is_double_word && !is_min, "sanity");
1435     evpmovq2m(ktmp, b, vlen_enc);
1436     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1437     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1438     vmaxpd(dst, atmp, btmp, vlen_enc);
1439     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1440     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1441   }
1442 }
1443 
1444 // Float/Double signum
1445 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1446   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
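  // Return the argument unchanged for +/-0.0 and NaN; otherwise load 1.0 into dst and,
  // if the argument was negative, flip its sign bit to produce -1.0.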
1447 
1448   Label DONE_LABEL;
1449 
1450   if (opcode == Op_SignumF) {
1451     assert(UseSSE > 0, "required");
1452     ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0: if the argument is +0.0/-0.0, return the argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN: if the argument is NaN, return NaN
1455     movflt(dst, one);
1456     jcc(Assembler::above, DONE_LABEL);
1457     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1458   } else if (opcode == Op_SignumD) {
1459     assert(UseSSE > 1, "required");
1460     ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0: if the argument is +0.0/-0.0, return the argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN: if the argument is NaN, return NaN
1463     movdbl(dst, one);
1464     jcc(Assembler::above, DONE_LABEL);
1465     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1466   }
1467 
1468   bind(DONE_LABEL);
1469 }
1470 
1471 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1472   if (sign) {
1473     pmovsxbw(dst, src);
1474   } else {
1475     pmovzxbw(dst, src);
1476   }
1477 }
1478 
1479 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1480   if (sign) {
1481     vpmovsxbw(dst, src, vector_len);
1482   } else {
1483     vpmovzxbw(dst, src, vector_len);
1484   }
1485 }
1486 
1487 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1488   if (sign) {
1489     vpmovsxbd(dst, src, vector_len);
1490   } else {
1491     vpmovzxbd(dst, src, vector_len);
1492   }
1493 }
1494 
1495 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1496   if (sign) {
1497     vpmovsxwd(dst, src, vector_len);
1498   } else {
1499     vpmovzxwd(dst, src, vector_len);
1500   }
1501 }
1502 
1503 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1504                                      int shift, int vector_len) {
1505   if (opcode == Op_RotateLeftV) {
1506     if (etype == T_INT) {
1507       evprold(dst, src, shift, vector_len);
1508     } else {
1509       assert(etype == T_LONG, "expected type T_LONG");
1510       evprolq(dst, src, shift, vector_len);
1511     }
1512   } else {
1513     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1514     if (etype == T_INT) {
1515       evprord(dst, src, shift, vector_len);
1516     } else {
1517       assert(etype == T_LONG, "expected type T_LONG");
1518       evprorq(dst, src, shift, vector_len);
1519     }
1520   }
1521 }
1522 
1523 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1524                                      XMMRegister shift, int vector_len) {
1525   if (opcode == Op_RotateLeftV) {
1526     if (etype == T_INT) {
1527       evprolvd(dst, src, shift, vector_len);
1528     } else {
1529       assert(etype == T_LONG, "expected type T_LONG");
1530       evprolvq(dst, src, shift, vector_len);
1531     }
1532   } else {
1533     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1534     if (etype == T_INT) {
1535       evprorvd(dst, src, shift, vector_len);
1536     } else {
1537       assert(etype == T_LONG, "expected type T_LONG");
1538       evprorvq(dst, src, shift, vector_len);
1539     }
1540   }
1541 }
1542 
1543 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1544   if (opcode == Op_RShiftVI) {
1545     psrad(dst, shift);
1546   } else if (opcode == Op_LShiftVI) {
1547     pslld(dst, shift);
1548   } else {
1549     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1550     psrld(dst, shift);
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1555   switch (opcode) {
1556     case Op_RShiftVI:  psrad(dst, shift); break;
1557     case Op_LShiftVI:  pslld(dst, shift); break;
1558     case Op_URShiftVI: psrld(dst, shift); break;
1559 
1560     default: assert(false, "%s", NodeClassNames[opcode]);
1561   }
1562 }
1563 
1564 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1565   if (opcode == Op_RShiftVI) {
1566     vpsrad(dst, nds, shift, vector_len);
1567   } else if (opcode == Op_LShiftVI) {
1568     vpslld(dst, nds, shift, vector_len);
1569   } else {
1570     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1571     vpsrld(dst, nds, shift, vector_len);
1572   }
1573 }
1574 
1575 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1576   switch (opcode) {
1577     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1578     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1579     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1580 
1581     default: assert(false, "%s", NodeClassNames[opcode]);
1582   }
1583 }
1584 
1585 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1586   switch (opcode) {
1587     case Op_RShiftVB:  // fall-through
1588     case Op_RShiftVS:  psraw(dst, shift); break;
1589 
1590     case Op_LShiftVB:  // fall-through
1591     case Op_LShiftVS:  psllw(dst, shift);   break;
1592 
1593     case Op_URShiftVS: // fall-through
1594     case Op_URShiftVB: psrlw(dst, shift);  break;
1595 
1596     default: assert(false, "%s", NodeClassNames[opcode]);
1597   }
1598 }
1599 
1600 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1601   switch (opcode) {
1602     case Op_RShiftVB:  // fall-through
1603     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1604 
1605     case Op_LShiftVB:  // fall-through
1606     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1607 
1608     case Op_URShiftVS: // fall-through
1609     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1610 
1611     default: assert(false, "%s", NodeClassNames[opcode]);
1612   }
1613 }
1614 
1615 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1616   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1618     case Op_LShiftVL:  psllq(dst, shift); break;
1619     case Op_URShiftVL: psrlq(dst, shift); break;
1620 
1621     default: assert(false, "%s", NodeClassNames[opcode]);
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1626   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1628   } else if (opcode == Op_LShiftVL) {
1629     psllq(dst, shift);
1630   } else {
1631     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1632     psrlq(dst, shift);
1633   }
1634 }
1635 
1636 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1637   switch (opcode) {
1638     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1639     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1640     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1641 
1642     default: assert(false, "%s", NodeClassNames[opcode]);
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1647   if (opcode == Op_RShiftVL) {
1648     evpsraq(dst, nds, shift, vector_len);
1649   } else if (opcode == Op_LShiftVL) {
1650     vpsllq(dst, nds, shift, vector_len);
1651   } else {
1652     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1653     vpsrlq(dst, nds, shift, vector_len);
1654   }
1655 }
1656 
1657 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1658   switch (opcode) {
1659     case Op_RShiftVB:  // fall-through
1660     case Op_RShiftVS:  // fall-through
1661     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1662 
1663     case Op_LShiftVB:  // fall-through
1664     case Op_LShiftVS:  // fall-through
1665     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1666 
1667     case Op_URShiftVB: // fall-through
1668     case Op_URShiftVS: // fall-through
1669     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1670 
1671     default: assert(false, "%s", NodeClassNames[opcode]);
1672   }
1673 }
1674 
1675 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1676   switch (opcode) {
1677     case Op_RShiftVB:  // fall-through
1678     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1679 
1680     case Op_LShiftVB:  // fall-through
1681     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1682 
1683     case Op_URShiftVB: // fall-through
1684     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1685 
1686     default: assert(false, "%s", NodeClassNames[opcode]);
1687   }
1688 }
1689 
1690 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1691   assert(UseAVX >= 2, "required");
1692   switch (opcode) {
1693     case Op_RShiftVL: {
1694       if (UseAVX > 2) {
1695         assert(tmp == xnoreg, "not used");
1696         if (!VM_Version::supports_avx512vl()) {
1697           vlen_enc = Assembler::AVX_512bit;
1698         }
1699         evpsravq(dst, src, shift, vlen_enc);
1700       } else {
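        // No variable 64-bit arithmetic right shift before AVX-512; emulate it as
        //   m = sign_mask >>> s;  dst = ((src >>> s) ^ m) - m
        // which re-extends the sign after the logical shift.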
1701         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1702         vpsrlvq(dst, src, shift, vlen_enc);
1703         vpsrlvq(tmp, tmp, shift, vlen_enc);
1704         vpxor(dst, dst, tmp, vlen_enc);
1705         vpsubq(dst, dst, tmp, vlen_enc);
1706       }
1707       break;
1708     }
1709     case Op_LShiftVL: {
1710       assert(tmp == xnoreg, "not used");
1711       vpsllvq(dst, src, shift, vlen_enc);
1712       break;
1713     }
1714     case Op_URShiftVL: {
1715       assert(tmp == xnoreg, "not used");
1716       vpsrlvq(dst, src, shift, vlen_enc);
1717       break;
1718     }
1719     default: assert(false, "%s", NodeClassNames[opcode]);
1720   }
1721 }
1722 
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1724 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1725   assert(opcode == Op_LShiftVB ||
1726          opcode == Op_RShiftVB ||
1727          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1728   bool sign = (opcode != Op_URShiftVB);
1729   assert(vector_len == 0, "required");
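  // Widen bytes to dwords, do the variable dword shift, mask each lane back to an
  // unsigned byte value, then narrow the two 128-bit halves into words with vpackusdw.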
1730   vextendbd(sign, dst, src, 1);
1731   vpmovzxbd(vtmp, shift, 1);
1732   varshiftd(opcode, dst, dst, vtmp, 1);
1733   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1734   vextracti128_high(vtmp, dst);
1735   vpackusdw(dst, dst, vtmp, 0);
1736 }
1737 
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1739 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1740   assert(opcode == Op_LShiftVB ||
1741          opcode == Op_RShiftVB ||
1742          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1743   bool sign = (opcode != Op_URShiftVB);
1744   int ext_vector_len = vector_len + 1;
1745   vextendbw(sign, dst, src, ext_vector_len);
1746   vpmovzxbw(vtmp, shift, ext_vector_len);
1747   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1748   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1749   if (vector_len == 0) {
1750     vextracti128_high(vtmp, dst);
1751     vpackuswb(dst, dst, vtmp, vector_len);
1752   } else {
1753     vextracti64x4_high(vtmp, dst);
1754     vpackuswb(dst, dst, vtmp, vector_len);
1755     vpermq(dst, dst, 0xD8, vector_len);
1756   }
1757 }
1758 
1759 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1760   switch(typ) {
1761     case T_BYTE:
1762       pinsrb(dst, val, idx);
1763       break;
1764     case T_SHORT:
1765       pinsrw(dst, val, idx);
1766       break;
1767     case T_INT:
1768       pinsrd(dst, val, idx);
1769       break;
1770     case T_LONG:
1771       pinsrq(dst, val, idx);
1772       break;
1773     default:
1774       assert(false,"Should not reach here.");
1775       break;
1776   }
1777 }
1778 
1779 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1780   switch(typ) {
1781     case T_BYTE:
1782       vpinsrb(dst, src, val, idx);
1783       break;
1784     case T_SHORT:
1785       vpinsrw(dst, src, val, idx);
1786       break;
1787     case T_INT:
1788       vpinsrd(dst, src, val, idx);
1789       break;
1790     case T_LONG:
1791       vpinsrq(dst, src, val, idx);
1792       break;
1793     default:
1794       assert(false,"Should not reach here.");
1795       break;
1796   }
1797 }
1798 
1799 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
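  // Gather with 32-bit indices scaled by the element size; the mask lanes control which
  // elements are actually loaded.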
1800   switch(typ) {
1801     case T_INT:
1802       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1803       break;
1804     case T_FLOAT:
1805       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1806       break;
1807     case T_LONG:
1808       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1809       break;
1810     case T_DOUBLE:
1811       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1812       break;
1813     default:
1814       assert(false,"Should not reach here.");
1815       break;
1816   }
1817 }
1818 
1819 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1820   switch(typ) {
1821     case T_INT:
1822       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1823       break;
1824     case T_FLOAT:
1825       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1826       break;
1827     case T_LONG:
1828       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1829       break;
1830     case T_DOUBLE:
1831       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1832       break;
1833     default:
1834       assert(false,"Should not reach here.");
1835       break;
1836   }
1837 }
1838 
1839 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1840   switch(typ) {
1841     case T_INT:
1842       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1843       break;
1844     case T_FLOAT:
1845       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1846       break;
1847     case T_LONG:
1848       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1849       break;
1850     case T_DOUBLE:
1851       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1852       break;
1853     default:
1854       assert(false,"Should not reach here.");
1855       break;
1856   }
1857 }
1858 
1859 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
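  // Expand a boolean vector (one byte per element, 0 or 1) into a full-width element
  // mask: 0 - x yields 0x00/0xFF per byte, which is then sign-extended to the element size.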
1860   if (vlen_in_bytes <= 16) {
1861     pxor (dst, dst);
1862     psubb(dst, src);
1863     switch (elem_bt) {
1864       case T_BYTE:   /* nothing to do */ break;
1865       case T_SHORT:  pmovsxbw(dst, dst); break;
1866       case T_INT:    pmovsxbd(dst, dst); break;
1867       case T_FLOAT:  pmovsxbd(dst, dst); break;
1868       case T_LONG:   pmovsxbq(dst, dst); break;
1869       case T_DOUBLE: pmovsxbq(dst, dst); break;
1870 
1871       default: assert(false, "%s", type2name(elem_bt));
1872     }
1873   } else {
1874     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1875     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1876 
1877     vpxor (dst, dst, dst, vlen_enc);
1878     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1879 
1880     switch (elem_bt) {
1881       case T_BYTE:   /* nothing to do */            break;
1882       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1883       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1884       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1885       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1886       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1887 
1888       default: assert(false, "%s", type2name(elem_bt));
1889     }
1890   }
1891 }
1892 
1893 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1894   if (novlbwdq) {
1895     vpmovsxbd(xtmp, src, vlen_enc);
1896     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1897             Assembler::eq, true, vlen_enc, noreg);
1898   } else {
1899     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1900     vpsubb(xtmp, xtmp, src, vlen_enc);
1901     evpmovb2m(dst, xtmp, vlen_enc);
1902   }
1903 }
1904 
1905 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1906   switch (vlen_in_bytes) {
1907     case 4:  movdl(dst, src);   break;
1908     case 8:  movq(dst, src);    break;
1909     case 16: movdqu(dst, src);  break;
1910     case 32: vmovdqu(dst, src); break;
1911     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1912     default: ShouldNotReachHere();
1913   }
1914 }
1915 
1916 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1917   assert(rscratch != noreg || always_reachable(src), "missing");
1918 
1919   if (reachable(src)) {
1920     load_vector(dst, as_Address(src), vlen_in_bytes);
1921   } else {
1922     lea(rscratch, src);
1923     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1924   }
1925 }
1926 
1927 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1928   int vlen_enc = vector_length_encoding(vlen);
1929   if (VM_Version::supports_avx()) {
1930     if (bt == T_LONG) {
1931       if (VM_Version::supports_avx2()) {
1932         vpbroadcastq(dst, src, vlen_enc);
1933       } else {
1934         vmovddup(dst, src, vlen_enc);
1935       }
1936     } else if (bt == T_DOUBLE) {
1937       if (vlen_enc != Assembler::AVX_128bit) {
1938         vbroadcastsd(dst, src, vlen_enc, noreg);
1939       } else {
1940         vmovddup(dst, src, vlen_enc);
1941       }
1942     } else {
1943       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1944         vpbroadcastd(dst, src, vlen_enc);
1945       } else {
1946         vbroadcastss(dst, src, vlen_enc);
1947       }
1948     }
1949   } else if (VM_Version::supports_sse3()) {
1950     movddup(dst, src);
1951   } else {
1952     movq(dst, src);
1953     if (vlen == 16) {
1954       punpcklqdq(dst, dst);
1955     }
1956   }
1957 }
1958 
1959 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1960   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
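  // e.g. T_INT:    exact_log2(4) << 6          = 128
  //      T_DOUBLE: (exact_log2(8) << 6) + 128  = 320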
1961   int offset = exact_log2(type2aelembytes(bt)) << 6;
1962   if (is_floating_point_type(bt)) {
1963     offset += 128;
1964   }
1965   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1966   load_vector(dst, addr, vlen_in_bytes);
1967 }
1968 
1969 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
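// The reduce* helpers repeatedly fold the upper half of the vector onto the lower half
// (via reduce_operation_128/256), then fold in the running scalar: src1 for integral
// types, dst for float/double.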
1970 
1971 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1972   int vector_len = Assembler::AVX_128bit;
1973 
1974   switch (opcode) {
1975     case Op_AndReductionV:  pand(dst, src); break;
1976     case Op_OrReductionV:   por (dst, src); break;
1977     case Op_XorReductionV:  pxor(dst, src); break;
1978     case Op_MinReductionV:
1979       switch (typ) {
1980         case T_BYTE:        pminsb(dst, src); break;
1981         case T_SHORT:       pminsw(dst, src); break;
1982         case T_INT:         pminsd(dst, src); break;
1983         case T_LONG:        assert(UseAVX > 2, "required");
1984                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1985         default:            assert(false, "wrong type");
1986       }
1987       break;
1988     case Op_MaxReductionV:
1989       switch (typ) {
1990         case T_BYTE:        pmaxsb(dst, src); break;
1991         case T_SHORT:       pmaxsw(dst, src); break;
1992         case T_INT:         pmaxsd(dst, src); break;
1993         case T_LONG:        assert(UseAVX > 2, "required");
1994                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1995         default:            assert(false, "wrong type");
1996       }
1997       break;
1998     case Op_AddReductionVF: addss(dst, src); break;
1999     case Op_AddReductionVD: addsd(dst, src); break;
2000     case Op_AddReductionVI:
2001       switch (typ) {
2002         case T_BYTE:        paddb(dst, src); break;
2003         case T_SHORT:       paddw(dst, src); break;
2004         case T_INT:         paddd(dst, src); break;
2005         default:            assert(false, "wrong type");
2006       }
2007       break;
2008     case Op_AddReductionVL: paddq(dst, src); break;
2009     case Op_MulReductionVF: mulss(dst, src); break;
2010     case Op_MulReductionVD: mulsd(dst, src); break;
2011     case Op_MulReductionVI:
2012       switch (typ) {
2013         case T_SHORT:       pmullw(dst, src); break;
2014         case T_INT:         pmulld(dst, src); break;
2015         default:            assert(false, "wrong type");
2016       }
2017       break;
2018     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2019                             evpmullq(dst, dst, src, vector_len); break;
2020     default:                assert(false, "wrong opcode");
2021   }
2022 }
2023 
2024 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2025   int vector_len = Assembler::AVX_256bit;
2026 
2027   switch (opcode) {
2028     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2029     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2030     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2031     case Op_MinReductionV:
2032       switch (typ) {
2033         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2034         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2035         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2036         case T_LONG:        assert(UseAVX > 2, "required");
2037                             vpminsq(dst, src1, src2, vector_len); break;
2038         default:            assert(false, "wrong type");
2039       }
2040       break;
2041     case Op_MaxReductionV:
2042       switch (typ) {
2043         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2044         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2045         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2046         case T_LONG:        assert(UseAVX > 2, "required");
2047                             vpmaxsq(dst, src1, src2, vector_len); break;
2048         default:            assert(false, "wrong type");
2049       }
2050       break;
2051     case Op_AddReductionVI:
2052       switch (typ) {
2053         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2054         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2055         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2056         default:            assert(false, "wrong type");
2057       }
2058       break;
2059     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2060     case Op_MulReductionVI:
2061       switch (typ) {
2062         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2063         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2064         default:            assert(false, "wrong type");
2065       }
2066       break;
2067     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2068     default:                assert(false, "wrong opcode");
2069   }
2070 }
2071 
2072 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2073                                   XMMRegister dst, XMMRegister src,
2074                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2075   switch (opcode) {
2076     case Op_AddReductionVF:
2077     case Op_MulReductionVF:
2078       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2079       break;
2080 
2081     case Op_AddReductionVD:
2082     case Op_MulReductionVD:
2083       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2084       break;
2085 
2086     default: assert(false, "wrong opcode");
2087   }
2088 }
2089 
2090 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2091                              Register dst, Register src1, XMMRegister src2,
2092                              XMMRegister vtmp1, XMMRegister vtmp2) {
2093   switch (vlen) {
2094     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2095     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2096     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2097     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2098 
2099     default: assert(false, "wrong vector length");
2100   }
2101 }
2102 
2103 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2104                              Register dst, Register src1, XMMRegister src2,
2105                              XMMRegister vtmp1, XMMRegister vtmp2) {
2106   switch (vlen) {
2107     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2108     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2109     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2110     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2111 
2112     default: assert(false, "wrong vector length");
2113   }
2114 }
2115 
2116 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2117                              Register dst, Register src1, XMMRegister src2,
2118                              XMMRegister vtmp1, XMMRegister vtmp2) {
2119   switch (vlen) {
2120     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2121     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2122     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2123     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2124 
2125     default: assert(false, "wrong vector length");
2126   }
2127 }
2128 
2129 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2130                              Register dst, Register src1, XMMRegister src2,
2131                              XMMRegister vtmp1, XMMRegister vtmp2) {
2132   switch (vlen) {
2133     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2134     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2135     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2136     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2137 
2138     default: assert(false, "wrong vector length");
2139   }
2140 }
2141 
2142 #ifdef _LP64
2143 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2144                              Register dst, Register src1, XMMRegister src2,
2145                              XMMRegister vtmp1, XMMRegister vtmp2) {
2146   switch (vlen) {
2147     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2148     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2149     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2150 
2151     default: assert(false, "wrong vector length");
2152   }
2153 }
2154 #endif // _LP64
2155 
2156 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2157   switch (vlen) {
2158     case 2:
2159       assert(vtmp2 == xnoreg, "");
2160       reduce2F(opcode, dst, src, vtmp1);
2161       break;
2162     case 4:
2163       assert(vtmp2 == xnoreg, "");
2164       reduce4F(opcode, dst, src, vtmp1);
2165       break;
2166     case 8:
2167       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2168       break;
2169     case 16:
2170       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2171       break;
2172     default: assert(false, "wrong vector length");
2173   }
2174 }
2175 
2176 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2177   switch (vlen) {
2178     case 2:
2179       assert(vtmp2 == xnoreg, "");
2180       reduce2D(opcode, dst, src, vtmp1);
2181       break;
2182     case 4:
2183       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2184       break;
2185     case 8:
2186       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2187       break;
2188     default: assert(false, "wrong vector length");
2189   }
2190 }
2191 
2192 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2193   if (opcode == Op_AddReductionVI) {
2194     if (vtmp1 != src2) {
2195       movdqu(vtmp1, src2);
2196     }
2197     phaddd(vtmp1, vtmp1);
2198   } else {
2199     pshufd(vtmp1, src2, 0x1);
2200     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2201   }
2202   movdl(vtmp2, src1);
2203   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2204   movdl(dst, vtmp1);
2205 }
2206 
2207 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2208   if (opcode == Op_AddReductionVI) {
2209     if (vtmp1 != src2) {
2210       movdqu(vtmp1, src2);
2211     }
2212     phaddd(vtmp1, src2);
2213     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2214   } else {
2215     pshufd(vtmp2, src2, 0xE);
2216     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2217     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2218   }
2219 }
2220 
2221 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2222   if (opcode == Op_AddReductionVI) {
2223     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2224     vextracti128_high(vtmp2, vtmp1);
2225     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2226     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2227   } else {
2228     vextracti128_high(vtmp1, src2);
2229     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2230     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2231   }
2232 }
2233 
2234 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2235   vextracti64x4_high(vtmp2, src2);
2236   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2237   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2238 }
2239 
2240 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2241   pshufd(vtmp2, src2, 0x1);
2242   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2243   movdqu(vtmp1, vtmp2);
2244   psrldq(vtmp1, 2);
2245   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2246   movdqu(vtmp2, vtmp1);
2247   psrldq(vtmp2, 1);
2248   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2249   movdl(vtmp2, src1);
2250   pmovsxbd(vtmp1, vtmp1);
2251   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2252   pextrb(dst, vtmp1, 0x0);
2253   movsbl(dst, dst);
2254 }
2255 
2256 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2257   pshufd(vtmp1, src2, 0xE);
2258   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2259   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2260 }
2261 
2262 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2263   vextracti128_high(vtmp2, src2);
2264   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2265   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2266 }
2267 
2268 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2269   vextracti64x4_high(vtmp1, src2);
2270   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2271   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2272 }
2273 
2274 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2275   pmovsxbw(vtmp2, src2);
2276   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2277 }
2278 
2279 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2280   if (UseAVX > 1) {
2281     int vector_len = Assembler::AVX_256bit;
2282     vpmovsxbw(vtmp1, src2, vector_len);
2283     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2284   } else {
2285     pmovsxbw(vtmp2, src2);
2286     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 into the low half
    pmovsxbw(vtmp2, vtmp2);
2289     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2290   }
2291 }
2292 
2293 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2294   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2295     int vector_len = Assembler::AVX_512bit;
2296     vpmovsxbw(vtmp1, src2, vector_len);
2297     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2298   } else {
2299     assert(UseAVX >= 2,"Should not reach here.");
2300     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2301     vextracti128_high(vtmp2, src2);
2302     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2303   }
2304 }
2305 
2306 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2307   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2308   vextracti64x4_high(vtmp2, src2);
2309   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2310 }
2311 
2312 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2313   if (opcode == Op_AddReductionVI) {
2314     if (vtmp1 != src2) {
2315       movdqu(vtmp1, src2);
2316     }
2317     phaddw(vtmp1, vtmp1);
2318     phaddw(vtmp1, vtmp1);
2319   } else {
2320     pshufd(vtmp2, src2, 0x1);
2321     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2322     movdqu(vtmp1, vtmp2);
2323     psrldq(vtmp1, 2);
2324     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2325   }
2326   movdl(vtmp2, src1);
2327   pmovsxwd(vtmp1, vtmp1);
2328   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2329   pextrw(dst, vtmp1, 0x0);
2330   movswl(dst, dst);
2331 }
2332 
2333 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2334   if (opcode == Op_AddReductionVI) {
2335     if (vtmp1 != src2) {
2336       movdqu(vtmp1, src2);
2337     }
2338     phaddw(vtmp1, src2);
2339   } else {
2340     pshufd(vtmp1, src2, 0xE);
2341     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2342   }
2343   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2344 }
2345 
2346 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2347   if (opcode == Op_AddReductionVI) {
2348     int vector_len = Assembler::AVX_256bit;
2349     vphaddw(vtmp2, src2, src2, vector_len);
2350     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2351   } else {
2352     vextracti128_high(vtmp2, src2);
2353     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2354   }
2355   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2356 }
2357 
2358 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2359   int vector_len = Assembler::AVX_256bit;
2360   vextracti64x4_high(vtmp1, src2);
2361   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2362   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2363 }
2364 
2365 #ifdef _LP64
2366 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2367   pshufd(vtmp2, src2, 0xE);
2368   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2369   movdq(vtmp1, src1);
2370   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2371   movdq(dst, vtmp1);
2372 }
2373 
2374 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2375   vextracti128_high(vtmp1, src2);
2376   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2377   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2378 }
2379 
2380 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2381   vextracti64x4_high(vtmp2, src2);
2382   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2383   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2384 }
2385 
2386 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
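  // Set the low 'len' bits of the opmask: bzhi zeroes every bit of -1 at index >= len.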
2387   mov64(temp, -1L);
2388   bzhiq(temp, temp, len);
2389   kmovql(dst, temp);
2390 }
2391 #endif // _LP64
2392 
2393 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
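  // FP reductions accumulate element by element with scalar addss/mulss, keeping the
  // evaluation order deterministic (FP add/mul are not associative).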
2394   reduce_operation_128(T_FLOAT, opcode, dst, src);
2395   pshufd(vtmp, src, 0x1);
2396   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2397 }
2398 
2399 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2400   reduce2F(opcode, dst, src, vtmp);
2401   pshufd(vtmp, src, 0x2);
2402   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2403   pshufd(vtmp, src, 0x3);
2404   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2405 }
2406 
2407 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2408   reduce4F(opcode, dst, src, vtmp2);
2409   vextractf128_high(vtmp2, src);
2410   reduce4F(opcode, dst, vtmp2, vtmp1);
2411 }
2412 
2413 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2414   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2415   vextracti64x4_high(vtmp1, src);
2416   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2417 }
2418 
2419 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2420   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2421   pshufd(vtmp, src, 0xE);
2422   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2423 }
2424 
2425 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2426   reduce2D(opcode, dst, src, vtmp2);
2427   vextractf128_high(vtmp2, src);
2428   reduce2D(opcode, dst, vtmp2, vtmp1);
2429 }
2430 
2431 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2432   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2433   vextracti64x4_high(vtmp1, src);
2434   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2435 }
2436 
2437 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2438   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2439 }
2440 
2441 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2442   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2443 }
2444 
2445 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2446                                  int vec_enc) {
2447   switch(elem_bt) {
2448     case T_INT:
2449     case T_FLOAT:
2450       vmaskmovps(dst, src, mask, vec_enc);
2451       break;
2452     case T_LONG:
2453     case T_DOUBLE:
2454       vmaskmovpd(dst, src, mask, vec_enc);
2455       break;
2456     default:
2457       fatal("Unsupported type %s", type2name(elem_bt));
2458       break;
2459   }
2460 }
2461 
2462 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2463                                  int vec_enc) {
2464   switch(elem_bt) {
2465     case T_INT:
2466     case T_FLOAT:
2467       vmaskmovps(dst, src, mask, vec_enc);
2468       break;
2469     case T_LONG:
2470     case T_DOUBLE:
2471       vmaskmovpd(dst, src, mask, vec_enc);
2472       break;
2473     default:
2474       fatal("Unsupported type %s", type2name(elem_bt));
2475       break;
2476   }
2477 }
2478 
2479 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2480                                           XMMRegister dst, XMMRegister src,
2481                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2482                                           XMMRegister xmm_0, XMMRegister xmm_1) {
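  // Halve the vector each iteration (e.g. 512->256->128->64->32 bits for a 16-float
  // vector), min/max-ing the two halves, then fold the running value in dst back in
  // when is_dst_valid.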
2483   const int permconst[] = {1, 14};
2484   XMMRegister wsrc = src;
2485   XMMRegister wdst = xmm_0;
2486   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2487 
2488   int vlen_enc = Assembler::AVX_128bit;
2489   if (vlen == 16) {
2490     vlen_enc = Assembler::AVX_256bit;
2491   }
2492 
2493   for (int i = log2(vlen) - 1; i >=0; i--) {
2494     if (i == 0 && !is_dst_valid) {
2495       wdst = dst;
2496     }
2497     if (i == 3) {
2498       vextracti64x4_high(wtmp, wsrc);
2499     } else if (i == 2) {
2500       vextracti128_high(wtmp, wsrc);
2501     } else { // i = [0,1]
2502       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2503     }
2504     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2505     wsrc = wdst;
2506     vlen_enc = Assembler::AVX_128bit;
2507   }
2508   if (is_dst_valid) {
2509     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2510   }
2511 }
2512 
2513 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2514                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2515                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2516   XMMRegister wsrc = src;
2517   XMMRegister wdst = xmm_0;
2518   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2519   int vlen_enc = Assembler::AVX_128bit;
2520   if (vlen == 8) {
2521     vlen_enc = Assembler::AVX_256bit;
2522   }
2523   for (int i = log2(vlen) - 1; i >=0; i--) {
2524     if (i == 0 && !is_dst_valid) {
2525       wdst = dst;
2526     }
2527     if (i == 1) {
2528       vextracti128_high(wtmp, wsrc);
2529     } else if (i == 2) {
2530       vextracti64x4_high(wtmp, wsrc);
2531     } else {
2532       assert(i == 0, "%d", i);
2533       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2534     }
2535     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2536     wsrc = wdst;
2537     vlen_enc = Assembler::AVX_128bit;
2538   }
2539   if (is_dst_valid) {
2540     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2541   }
2542 }
2543 
2544 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2545   switch (bt) {
2546     case T_BYTE:  pextrb(dst, src, idx); break;
2547     case T_SHORT: pextrw(dst, src, idx); break;
2548     case T_INT:   pextrd(dst, src, idx); break;
2549     case T_LONG:  pextrq(dst, src, idx); break;
2550 
2551     default:
2552       assert(false,"Should not reach here.");
2553       break;
2554   }
2555 }
2556 
2557 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
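  // Return a register holding the 128-bit lane that contains 'elemindex': the low lane
  // is src itself, higher lanes are first extracted into dst.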
2558   int esize =  type2aelembytes(typ);
2559   int elem_per_lane = 16/esize;
2560   int lane = elemindex / elem_per_lane;
2561   int eindex = elemindex % elem_per_lane;
2562 
2563   if (lane >= 2) {
2564     assert(UseAVX > 2, "required");
2565     vextractf32x4(dst, src, lane & 3);
2566     return dst;
2567   } else if (lane > 0) {
2568     assert(UseAVX > 0, "required");
2569     vextractf128(dst, src, lane);
2570     return dst;
2571   } else {
2572     return src;
2573   }
2574 }
2575 
2576 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2577   if (typ == T_BYTE) {
2578     movsbl(dst, dst);
2579   } else if (typ == T_SHORT) {
2580     movswl(dst, dst);
2581   }
2582 }
2583 
2584 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2585   int esize =  type2aelembytes(typ);
2586   int elem_per_lane = 16/esize;
2587   int eindex = elemindex % elem_per_lane;
2588   assert(is_integral_type(typ),"required");
2589 
2590   if (eindex == 0) {
2591     if (typ == T_LONG) {
2592       movq(dst, src);
2593     } else {
2594       movdl(dst, src);
2595       movsxl(typ, dst);
2596     }
2597   } else {
2598     extract(typ, dst, src, eindex);
2599     movsxl(typ, dst);
2600   }
2601 }
2602 
2603 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2604   int esize =  type2aelembytes(typ);
2605   int elem_per_lane = 16/esize;
2606   int eindex = elemindex % elem_per_lane;
2607   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2608 
2609   if (eindex == 0) {
2610     movq(dst, src);
2611   } else {
2612     if (typ == T_FLOAT) {
2613       if (UseAVX == 0) {
2614         movdqu(dst, src);
2615         shufps(dst, dst, eindex);
2616       } else {
2617         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2618       }
2619     } else {
2620       if (UseAVX == 0) {
2621         movdqu(dst, src);
2622         psrldq(dst, eindex*esize);
2623       } else {
2624         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2625       }
2626       movq(dst, dst);
2627     }
2628   }
2629   // Zero upper bits
2630   if (typ == T_FLOAT) {
2631     if (UseAVX == 0) {
2632       assert(vtmp != xnoreg, "required.");
2633       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2634       pand(dst, vtmp);
2635     } else {
2636       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2637     }
2638   }
2639 }
2640 
2641 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2642   switch(typ) {
2643     case T_BYTE:
2644     case T_BOOLEAN:
2645       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2646       break;
2647     case T_SHORT:
2648     case T_CHAR:
2649       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2650       break;
2651     case T_INT:
2652     case T_FLOAT:
2653       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2654       break;
2655     case T_LONG:
2656     case T_DOUBLE:
2657       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2658       break;
2659     default:
2660       assert(false,"Should not reach here.");
2661       break;
2662   }
2663 }
2664 
2665 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2666   assert(rscratch != noreg || always_reachable(src2), "missing");
2667 
2668   switch(typ) {
2669     case T_BOOLEAN:
2670     case T_BYTE:
2671       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2672       break;
2673     case T_CHAR:
2674     case T_SHORT:
2675       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2676       break;
2677     case T_INT:
2678     case T_FLOAT:
2679       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2680       break;
2681     case T_LONG:
2682     case T_DOUBLE:
2683       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2684       break;
2685     default:
2686       assert(false,"Should not reach here.");
2687       break;
2688   }
2689 }
2690 
2691 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2692   switch(typ) {
2693     case T_BYTE:
2694       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2695       break;
2696     case T_SHORT:
2697       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2698       break;
2699     case T_INT:
2700     case T_FLOAT:
2701       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2702       break;
2703     case T_LONG:
2704     case T_DOUBLE:
2705       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2706       break;
2707     default:
2708       assert(false,"Should not reach here.");
2709       break;
2710   }
2711 }
2712 
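// Emit a vector test of src1 against src2 (ptest / vtestps) for vectors of up
// to 32 bytes; vectors shorter than 16 bytes first have the low part of src1
// duplicated into vtmp so all 128 bits tested hold valid data.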
2713 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2714   assert(vlen_in_bytes <= 32, "");
2715   int esize = type2aelembytes(bt);
2716   if (vlen_in_bytes == 32) {
2717     assert(vtmp == xnoreg, "required.");
2718     if (esize >= 4) {
2719       vtestps(src1, src2, AVX_256bit);
2720     } else {
2721       vptest(src1, src2, AVX_256bit);
2722     }
2723     return;
2724   }
2725   if (vlen_in_bytes < 16) {
2726     // Duplicate the lower part to fill the whole register;
2727     // no need to do so for src2.
2728     assert(vtmp != xnoreg, "required");
2729     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2730     pshufd(vtmp, src1, shuffle_imm);
2731   } else {
2732     assert(vtmp == xnoreg, "required");
2733     vtmp = src1;
2734   }
2735   if (esize >= 4 && VM_Version::supports_avx()) {
2736     vtestps(vtmp, src2, AVX_128bit);
2737   } else {
2738     ptest(vtmp, src2);
2739   }
2740 }
2741 
2742 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2743   assert(UseAVX >= 2, "required");
2744 #ifdef ASSERT
2745   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2746   bool is_bw_supported = VM_Version::supports_avx512bw();
2747   if (is_bw && !is_bw_supported) {
2748     assert(vlen_enc != Assembler::AVX_512bit, "required");
2749     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2750            "XMM register should be 0-15");
2751   }
2752 #endif // ASSERT
2753   switch (elem_bt) {
2754     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2755     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2756     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2757     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2758     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2759     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2760     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2761   }
2762 }
2763 
2764 #ifdef _LP64
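// Broadcast the GPR value in `src` to every lane of `dst`. Falls back to a
// movd/movq into `dst` followed by an AVX2 broadcast when the required
// AVX-512 BW/VL support is missing (in which case only XMM0-15 may be used).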
2765 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2766   assert(UseAVX >= 2, "required");
2767   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2768   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2769   if ((UseAVX > 2) &&
2770       (!is_bw || VM_Version::supports_avx512bw()) &&
2771       (!is_vl || VM_Version::supports_avx512vl())) {
2772     switch (elem_bt) {
2773       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2774       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2775       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2776       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2777       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2778     }
2779   } else {
2780     assert(vlen_enc != Assembler::AVX_512bit, "required");
2781     assert((dst->encoding() < 16),"XMM register should be 0-15");
2782     switch (elem_bt) {
2783       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2784       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2785       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2786       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2787       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2788       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2789       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2790     }
2791   }
2792 }
2793 #endif
2794 
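// Sign-extend byte vector elements of `src` to `to_elem_bt`, converting to
// float/double via an intermediate int vector where needed.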
2795 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2796   switch (to_elem_bt) {
2797     case T_SHORT:
2798       vpmovsxbw(dst, src, vlen_enc);
2799       break;
2800     case T_INT:
2801       vpmovsxbd(dst, src, vlen_enc);
2802       break;
2803     case T_FLOAT:
2804       vpmovsxbd(dst, src, vlen_enc);
2805       vcvtdq2ps(dst, dst, vlen_enc);
2806       break;
2807     case T_LONG:
2808       vpmovsxbq(dst, src, vlen_enc);
2809       break;
2810     case T_DOUBLE: {
2811       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2812       vpmovsxbd(dst, src, mid_vlen_enc);
2813       vcvtdq2pd(dst, dst, vlen_enc);
2814       break;
2815     }
2816     default:
2817       fatal("Unsupported type %s", type2name(to_elem_bt));
2818       break;
2819   }
2820 }
2821 
2822 //-------------------------------------------------------------------------------------------
2823 
2824 // IndexOf for constant substrings with size >= 8 chars
2825 // which don't need to be loaded through the stack.
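// Roughly equivalent Java pseudocode (illustrative only):
//   for (int i = 0; i <= cnt1 - int_cnt2; i++) {
//     if (str1 matches str2 over int_cnt2 elements starting at i) return i;
//   }
//   return -1;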
2826 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2827                                          Register cnt1, Register cnt2,
2828                                          int int_cnt2,  Register result,
2829                                          XMMRegister vec, Register tmp,
2830                                          int ae) {
2831   ShortBranchVerifier sbv(this);
2832   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2833   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2834 
2835   // This method uses the pcmpestri instruction with bound registers
2836   //   inputs:
2837   //     xmm - substring
2838   //     rax - substring length (elements count)
2839   //     mem - scanned string
2840   //     rdx - string length (elements count)
2841   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2842   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2843   //   outputs:
2844   //     rcx - matched index in string
2845   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2846   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2847   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2848   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2849   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2850 
2851   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2852         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2853         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2854 
2855   // Note, inline_string_indexOf() generates checks:
2856   // if (substr.count > string.count) return -1;
2857   // if (substr.count == 0) return 0;
2858   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2859 
2860   // Load substring.
2861   if (ae == StrIntrinsicNode::UL) {
2862     pmovzxbw(vec, Address(str2, 0));
2863   } else {
2864     movdqu(vec, Address(str2, 0));
2865   }
2866   movl(cnt2, int_cnt2);
2867   movptr(result, str1); // string addr
2868 
2869   if (int_cnt2 > stride) {
2870     jmpb(SCAN_TO_SUBSTR);
2871 
2872     // Reload substr for rescan; this code
2873     // is executed only for large substrings (> 8 chars)
2874     bind(RELOAD_SUBSTR);
2875     if (ae == StrIntrinsicNode::UL) {
2876       pmovzxbw(vec, Address(str2, 0));
2877     } else {
2878       movdqu(vec, Address(str2, 0));
2879     }
2880     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2881 
2882     bind(RELOAD_STR);
2883     // We came here after the beginning of the substring was
2884     // matched but the rest of it was not, so we need to search
2885     // again. Start from the next element after the previous match.
2886 
2887     // cnt2 is the number of remaining substring elements and
2888     // cnt1 is the number of remaining string elements when the compare failed.
2889     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2890     subl(cnt1, cnt2);
2891     addl(cnt1, int_cnt2);
2892     movl(cnt2, int_cnt2); // Now restore cnt2
2893 
2894     decrementl(cnt1);     // Shift to next element
2895     cmpl(cnt1, cnt2);
2896     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
2897 
2898     addptr(result, (1<<scale1));
2899 
2900   } // (int_cnt2 > 8)
2901 
2902   // Scan string for start of substr in 16-byte vectors
2903   bind(SCAN_TO_SUBSTR);
2904   pcmpestri(vec, Address(result, 0), mode);
2905   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2906   subl(cnt1, stride);
2907   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2908   cmpl(cnt1, cnt2);
2909   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
2910   addptr(result, 16);
2911   jmpb(SCAN_TO_SUBSTR);
2912 
2913   // Found a potential substr
2914   bind(FOUND_CANDIDATE);
2915   // Matched whole vector if first element matched (tmp(rcx) == 0).
2916   if (int_cnt2 == stride) {
2917     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2918   } else { // int_cnt2 > 8
2919     jccb(Assembler::overflow, FOUND_SUBSTR);
2920   }
2921   // After pcmpestri tmp(rcx) contains matched element index
2922   // Compute start addr of substr
2923   lea(result, Address(result, tmp, scale1));
2924 
2925   // Make sure string is still long enough
2926   subl(cnt1, tmp);
2927   cmpl(cnt1, cnt2);
2928   if (int_cnt2 == stride) {
2929     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2930   } else { // int_cnt2 > 8
2931     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2932   }
2933   // Fewer chars left than the substring.
2934 
2935   bind(RET_NOT_FOUND);
2936   movl(result, -1);
2937   jmp(EXIT);
2938 
2939   if (int_cnt2 > stride) {
2940     // This code is optimized for the case where the whole substring
2941     // matches whenever its head matches.
2942     bind(MATCH_SUBSTR_HEAD);
2943     pcmpestri(vec, Address(result, 0), mode);
2944     // Reload only the string if it does not match
2945     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2946 
2947     Label CONT_SCAN_SUBSTR;
2948     // Compare the rest of substring (> 8 chars).
2949     bind(FOUND_SUBSTR);
2950     // First 8 chars are already matched.
2951     negptr(cnt2);
2952     addptr(cnt2, stride);
2953 
2954     bind(SCAN_SUBSTR);
2955     subl(cnt1, stride);
2956     cmpl(cnt2, -stride); // Do not read beyond substring
2957     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2958     // Back up strings to avoid reading beyond the substring:
2959     // cnt1 = cnt1 - cnt2 + 8
2960     addl(cnt1, cnt2); // cnt2 is negative
2961     addl(cnt1, stride);
2962     movl(cnt2, stride); negptr(cnt2);
2963     bind(CONT_SCAN_SUBSTR);
2964     if (int_cnt2 < (int)G) {
2965       int tail_off1 = int_cnt2<<scale1;
2966       int tail_off2 = int_cnt2<<scale2;
2967       if (ae == StrIntrinsicNode::UL) {
2968         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2969       } else {
2970         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2971       }
2972       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2973     } else {
2974       // calculate index in register to avoid integer overflow (int_cnt2*2)
2975       movl(tmp, int_cnt2);
2976       addptr(tmp, cnt2);
2977       if (ae == StrIntrinsicNode::UL) {
2978         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2979       } else {
2980         movdqu(vec, Address(str2, tmp, scale2, 0));
2981       }
2982       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2983     }
2984     // Need to reload string pointers if the whole vector did not match
2985     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2986     addptr(cnt2, stride);
2987     jcc(Assembler::negative, SCAN_SUBSTR);
2988     // Fall through if found full substring
2989 
2990   } // (int_cnt2 > 8)
2991 
2992   bind(RET_FOUND);
2993   // Found result if we matched full small substring.
2994   // Compute substr offset
2995   subptr(result, str1);
2996   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2997     shrl(result, 1); // index
2998   }
2999   bind(EXIT);
3000 
3001 } // string_indexofC8
3002 
3003 // Small strings are loaded through the stack if they cross a page boundary.
3004 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3005                                        Register cnt1, Register cnt2,
3006                                        int int_cnt2,  Register result,
3007                                        XMMRegister vec, Register tmp,
3008                                        int ae) {
3009   ShortBranchVerifier sbv(this);
3010   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3011   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3012 
3013   //
3014   // int_cnt2 is the length of a small (< 8 chars) constant substring,
3015   // or -1 for a non-constant substring, in which case its length
3016   // is in the cnt2 register.
3017   //
3018   // Note, inline_string_indexOf() generates checks:
3019   // if (substr.count > string.count) return -1;
3020   // if (substr.count == 0) return 0;
3021   //
3022   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3023   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3024   // This method uses the pcmpestri instruction with bound registers
3025   //   inputs:
3026   //     xmm - substring
3027   //     rax - substring length (elements count)
3028   //     mem - scanned string
3029   //     rdx - string length (elements count)
3030   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3031   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3032   //   outputs:
3033   //     rcx - matched index in string
3034   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3035   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3036   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3037   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3038 
3039   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3040         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3041         FOUND_CANDIDATE;
3042 
3043   { //========================================================
3044     // We don't know where these strings are located
3045     // and we can't read beyond them. Load them through the stack.
3046     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3047 
3048     movptr(tmp, rsp); // save old SP
3049 
3050     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3051       if (int_cnt2 == (1>>scale2)) { // One byte
3052         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3053         load_unsigned_byte(result, Address(str2, 0));
3054         movdl(vec, result); // move 32 bits
3055       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3056         // Not enough header space in 32-bit VM: 12+3 = 15.
3057         movl(result, Address(str2, -1));
3058         shrl(result, 8);
3059         movdl(vec, result); // move 32 bits
3060       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3061         load_unsigned_short(result, Address(str2, 0));
3062         movdl(vec, result); // move 32 bits
3063       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3064         movdl(vec, Address(str2, 0)); // move 32 bits
3065       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3066         movq(vec, Address(str2, 0));  // move 64 bits
3067       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3068         // Array header size is 12 bytes in 32-bit VM
3069         // + 6 bytes for 3 chars == 18 bytes,
3070         // enough space to load vec and shift.
3071         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3072         if (ae == StrIntrinsicNode::UL) {
3073           int tail_off = int_cnt2-8;
3074           pmovzxbw(vec, Address(str2, tail_off));
3075           psrldq(vec, -2*tail_off);
3076         }
3077         else {
3078           int tail_off = int_cnt2*(1<<scale2);
3079           movdqu(vec, Address(str2, tail_off-16));
3080           psrldq(vec, 16-tail_off);
3081         }
3082       }
3083     } else { // not constant substring
3084       cmpl(cnt2, stride);
3085       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3086 
3087       // We can read beyond the string if str+16 does not cross a page boundary
3088       // since heaps are aligned and mapped by pages.
3089       assert(os::vm_page_size() < (int)G, "default page should be small");
3090       movl(result, str2); // We need only low 32 bits
3091       andl(result, ((int)os::vm_page_size()-1));
3092       cmpl(result, ((int)os::vm_page_size()-16));
3093       jccb(Assembler::belowEqual, CHECK_STR);
3094 
3095       // Move small strings to the stack to allow loading 16 bytes into vec.
3096       subptr(rsp, 16);
3097       int stk_offset = wordSize-(1<<scale2);
3098       push(cnt2);
3099 
3100       bind(COPY_SUBSTR);
3101       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3102         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3103         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3104       } else if (ae == StrIntrinsicNode::UU) {
3105         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3106         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3107       }
3108       decrement(cnt2);
3109       jccb(Assembler::notZero, COPY_SUBSTR);
3110 
3111       pop(cnt2);
3112       movptr(str2, rsp);  // New substring address
3113     } // non constant
3114 
3115     bind(CHECK_STR);
3116     cmpl(cnt1, stride);
3117     jccb(Assembler::aboveEqual, BIG_STRINGS);
3118 
3119     // Check cross page boundary.
3120     movl(result, str1); // We need only low 32 bits
3121     andl(result, ((int)os::vm_page_size()-1));
3122     cmpl(result, ((int)os::vm_page_size()-16));
3123     jccb(Assembler::belowEqual, BIG_STRINGS);
3124 
3125     subptr(rsp, 16);
3126     int stk_offset = -(1<<scale1);
3127     if (int_cnt2 < 0) { // not constant
3128       push(cnt2);
3129       stk_offset += wordSize;
3130     }
3131     movl(cnt2, cnt1);
3132 
3133     bind(COPY_STR);
3134     if (ae == StrIntrinsicNode::LL) {
3135       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3136       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3137     } else {
3138       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3139       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3140     }
3141     decrement(cnt2);
3142     jccb(Assembler::notZero, COPY_STR);
3143 
3144     if (int_cnt2 < 0) { // not constant
3145       pop(cnt2);
3146     }
3147     movptr(str1, rsp);  // New string address
3148 
3149     bind(BIG_STRINGS);
3150     // Load substring.
3151     if (int_cnt2 < 0) { // -1
3152       if (ae == StrIntrinsicNode::UL) {
3153         pmovzxbw(vec, Address(str2, 0));
3154       } else {
3155         movdqu(vec, Address(str2, 0));
3156       }
3157       push(cnt2);       // substr count
3158       push(str2);       // substr addr
3159       push(str1);       // string addr
3160     } else {
3161       // Small (< 8 chars) constant substrings are loaded already.
3162       movl(cnt2, int_cnt2);
3163     }
3164     push(tmp);  // original SP
3165 
3166   } // Finished loading
3167 
3168   //========================================================
3169   // Start search
3170   //
3171 
3172   movptr(result, str1); // string addr
3173 
3174   if (int_cnt2  < 0) {  // Only for non constant substring
3175     jmpb(SCAN_TO_SUBSTR);
3176 
3177     // SP saved at sp+0
3178     // String saved at sp+1*wordSize
3179     // Substr saved at sp+2*wordSize
3180     // Substr count saved at sp+3*wordSize
3181 
3182     // Reload substr for rescan; this code
3183     // is executed only for large substrings (> 8 chars)
3184     bind(RELOAD_SUBSTR);
3185     movptr(str2, Address(rsp, 2*wordSize));
3186     movl(cnt2, Address(rsp, 3*wordSize));
3187     if (ae == StrIntrinsicNode::UL) {
3188       pmovzxbw(vec, Address(str2, 0));
3189     } else {
3190       movdqu(vec, Address(str2, 0));
3191     }
3192     // We came here after the beginning of the substring was
3193     // matched but the rest of it was not, so we need to search
3194     // again. Start from the next element after the previous match.
3195     subptr(str1, result); // Restore counter
3196     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3197       shrl(str1, 1);
3198     }
3199     addl(cnt1, str1);
3200     decrementl(cnt1);   // Shift to next element
3201     cmpl(cnt1, cnt2);
3202     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
3203 
3204     addptr(result, (1<<scale1));
3205   } // non constant
3206 
3207   // Scan string for start of substr in 16-byte vectors
3208   bind(SCAN_TO_SUBSTR);
3209   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3210   pcmpestri(vec, Address(result, 0), mode);
3211   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3212   subl(cnt1, stride);
3213   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3214   cmpl(cnt1, cnt2);
3215   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
3216   addptr(result, 16);
3217 
3218   bind(ADJUST_STR);
3219   cmpl(cnt1, stride); // Do not read beyond string
3220   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3221   // Back up the string to avoid reading beyond it.
3222   lea(result, Address(result, cnt1, scale1, -16));
3223   movl(cnt1, stride);
3224   jmpb(SCAN_TO_SUBSTR);
3225 
3226   // Found a potential substr
3227   bind(FOUND_CANDIDATE);
3228   // After pcmpestri tmp(rcx) contains matched element index
3229 
3230   // Make sure string is still long enough
3231   subl(cnt1, tmp);
3232   cmpl(cnt1, cnt2);
3233   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3234   // Fewer chars left than the substring.
3235 
3236   bind(RET_NOT_FOUND);
3237   movl(result, -1);
3238   jmp(CLEANUP);
3239 
3240   bind(FOUND_SUBSTR);
3241   // Compute start addr of substr
3242   lea(result, Address(result, tmp, scale1));
3243   if (int_cnt2 > 0) { // Constant substring
3244     // Repeat search for small substring (< 8 chars)
3245     // from new point without reloading substring.
3246     // Have to check that we don't read beyond string.
3247     cmpl(tmp, stride-int_cnt2);
3248     jccb(Assembler::greater, ADJUST_STR);
3249     // Fall through if matched whole substring.
3250   } else { // non constant
3251     assert(int_cnt2 == -1, "should be != 0");
3252 
3253     addl(tmp, cnt2);
3254     // Found result if we matched whole substring.
3255     cmpl(tmp, stride);
3256     jcc(Assembler::lessEqual, RET_FOUND);
3257 
3258     // Repeat search for small substring (<= 8 chars)
3259     // from new point 'str1' without reloading substring.
3260     cmpl(cnt2, stride);
3261     // Have to check that we don't read beyond string.
3262     jccb(Assembler::lessEqual, ADJUST_STR);
3263 
3264     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3265     // Compare the rest of substring (> 8 chars).
3266     movptr(str1, result);
3267 
3268     cmpl(tmp, cnt2);
3269     // First 8 chars are already matched.
3270     jccb(Assembler::equal, CHECK_NEXT);
3271 
3272     bind(SCAN_SUBSTR);
3273     pcmpestri(vec, Address(str1, 0), mode);
3274     // Need to reload string pointers if the whole vector did not match
3275     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3276 
3277     bind(CHECK_NEXT);
3278     subl(cnt2, stride);
3279     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3280     addptr(str1, 16);
3281     if (ae == StrIntrinsicNode::UL) {
3282       addptr(str2, 8);
3283     } else {
3284       addptr(str2, 16);
3285     }
3286     subl(cnt1, stride);
3287     cmpl(cnt2, stride); // Do not read beyond substring
3288     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3289     // Back up strings to avoid reading beyond the substring.
3290 
3291     if (ae == StrIntrinsicNode::UL) {
3292       lea(str2, Address(str2, cnt2, scale2, -8));
3293       lea(str1, Address(str1, cnt2, scale1, -16));
3294     } else {
3295       lea(str2, Address(str2, cnt2, scale2, -16));
3296       lea(str1, Address(str1, cnt2, scale1, -16));
3297     }
3298     subl(cnt1, cnt2);
3299     movl(cnt2, stride);
3300     addl(cnt1, stride);
3301     bind(CONT_SCAN_SUBSTR);
3302     if (ae == StrIntrinsicNode::UL) {
3303       pmovzxbw(vec, Address(str2, 0));
3304     } else {
3305       movdqu(vec, Address(str2, 0));
3306     }
3307     jmp(SCAN_SUBSTR);
3308 
3309     bind(RET_FOUND_LONG);
3310     movptr(str1, Address(rsp, wordSize));
3311   } // non constant
3312 
3313   bind(RET_FOUND);
3314   // Compute substr offset
3315   subptr(result, str1);
3316   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3317     shrl(result, 1); // index
3318   }
3319   bind(CLEANUP);
3320   pop(rsp); // restore SP
3321 
3322 } // string_indexof
3323 
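// Find the first occurrence of the char value `ch` in the UTF-16 string
// `str1` of length `cnt1` chars; `result` is the char index, or -1 if absent.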
3324 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3325                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3326   ShortBranchVerifier sbv(this);
3327   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3328 
3329   int stride = 8;
3330 
3331   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3332         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3333         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3334         FOUND_SEQ_CHAR, DONE_LABEL;
3335 
3336   movptr(result, str1);
3337   if (UseAVX >= 2) {
3338     cmpl(cnt1, stride);
3339     jcc(Assembler::less, SCAN_TO_CHAR);
3340     cmpl(cnt1, 2*stride);
3341     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3342     movdl(vec1, ch);
3343     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3344     vpxor(vec2, vec2);
3345     movl(tmp, cnt1);
3346     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3347     andl(cnt1,0x0000000F);  //tail count (in chars)
3348 
3349     bind(SCAN_TO_16_CHAR_LOOP);
3350     vmovdqu(vec3, Address(result, 0));
3351     vpcmpeqw(vec3, vec3, vec1, 1);
3352     vptest(vec2, vec3);
3353     jcc(Assembler::carryClear, FOUND_CHAR);
3354     addptr(result, 32);
3355     subl(tmp, 2*stride);
3356     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3357     jmp(SCAN_TO_8_CHAR);
3358     bind(SCAN_TO_8_CHAR_INIT);
3359     movdl(vec1, ch);
3360     pshuflw(vec1, vec1, 0x00);
3361     pshufd(vec1, vec1, 0);
3362     pxor(vec2, vec2);
3363   }
3364   bind(SCAN_TO_8_CHAR);
3365   cmpl(cnt1, stride);
3366   jcc(Assembler::less, SCAN_TO_CHAR);
3367   if (UseAVX < 2) {
3368     movdl(vec1, ch);
3369     pshuflw(vec1, vec1, 0x00);
3370     pshufd(vec1, vec1, 0);
3371     pxor(vec2, vec2);
3372   }
3373   movl(tmp, cnt1);
3374   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3375   andl(cnt1,0x00000007);  //tail count (in chars)
3376 
3377   bind(SCAN_TO_8_CHAR_LOOP);
3378   movdqu(vec3, Address(result, 0));
3379   pcmpeqw(vec3, vec1);
3380   ptest(vec2, vec3);
3381   jcc(Assembler::carryClear, FOUND_CHAR);
3382   addptr(result, 16);
3383   subl(tmp, stride);
3384   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3385   bind(SCAN_TO_CHAR);
3386   testl(cnt1, cnt1);
3387   jcc(Assembler::zero, RET_NOT_FOUND);
3388   bind(SCAN_TO_CHAR_LOOP);
3389   load_unsigned_short(tmp, Address(result, 0));
3390   cmpl(ch, tmp);
3391   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3392   addptr(result, 2);
3393   subl(cnt1, 1);
3394   jccb(Assembler::zero, RET_NOT_FOUND);
3395   jmp(SCAN_TO_CHAR_LOOP);
3396 
3397   bind(RET_NOT_FOUND);
3398   movl(result, -1);
3399   jmpb(DONE_LABEL);
3400 
3401   bind(FOUND_CHAR);
3402   if (UseAVX >= 2) {
3403     vpmovmskb(tmp, vec3);
3404   } else {
3405     pmovmskb(tmp, vec3);
3406   }
3407   bsfl(ch, tmp);
3408   addptr(result, ch);
3409 
3410   bind(FOUND_SEQ_CHAR);
3411   subptr(result, str1);
3412   shrl(result, 1);
3413 
3414   bind(DONE_LABEL);
3415 } // string_indexof_char
3416 
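// Latin-1 variant of string_indexof_char: searches a byte array for the byte
// value `ch`; `result` is the byte index, or -1 if absent.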
3417 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3418                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3419   ShortBranchVerifier sbv(this);
3420   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3421 
3422   int stride = 16;
3423 
3424   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3425         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3426         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3427         FOUND_SEQ_CHAR, DONE_LABEL;
3428 
3429   movptr(result, str1);
3430   if (UseAVX >= 2) {
3431     cmpl(cnt1, stride);
3432     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3433     cmpl(cnt1, stride*2);
3434     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3435     movdl(vec1, ch);
3436     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3437     vpxor(vec2, vec2);
3438     movl(tmp, cnt1);
3439     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3440     andl(cnt1,0x0000001F);  //tail count (in chars)
3441 
3442     bind(SCAN_TO_32_CHAR_LOOP);
3443     vmovdqu(vec3, Address(result, 0));
3444     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3445     vptest(vec2, vec3);
3446     jcc(Assembler::carryClear, FOUND_CHAR);
3447     addptr(result, 32);
3448     subl(tmp, stride*2);
3449     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3450     jmp(SCAN_TO_16_CHAR);
3451 
3452     bind(SCAN_TO_16_CHAR_INIT);
3453     movdl(vec1, ch);
3454     pxor(vec2, vec2);
3455     pshufb(vec1, vec2);
3456   }
3457 
3458   bind(SCAN_TO_16_CHAR);
3459   cmpl(cnt1, stride);
3460   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3461   if (UseAVX < 2) {
3462     movdl(vec1, ch);
3463     pxor(vec2, vec2);
3464     pshufb(vec1, vec2);
3465   }
3466   movl(tmp, cnt1);
3467   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3468   andl(cnt1,0x0000000F);  //tail count (in bytes)
3469 
3470   bind(SCAN_TO_16_CHAR_LOOP);
3471   movdqu(vec3, Address(result, 0));
3472   pcmpeqb(vec3, vec1);
3473   ptest(vec2, vec3);
3474   jcc(Assembler::carryClear, FOUND_CHAR);
3475   addptr(result, 16);
3476   subl(tmp, stride);
3477   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3478 
3479   bind(SCAN_TO_CHAR_INIT);
3480   testl(cnt1, cnt1);
3481   jcc(Assembler::zero, RET_NOT_FOUND);
3482   bind(SCAN_TO_CHAR_LOOP);
3483   load_unsigned_byte(tmp, Address(result, 0));
3484   cmpl(ch, tmp);
3485   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3486   addptr(result, 1);
3487   subl(cnt1, 1);
3488   jccb(Assembler::zero, RET_NOT_FOUND);
3489   jmp(SCAN_TO_CHAR_LOOP);
3490 
3491   bind(RET_NOT_FOUND);
3492   movl(result, -1);
3493   jmpb(DONE_LABEL);
3494 
3495   bind(FOUND_CHAR);
3496   if (UseAVX >= 2) {
3497     vpmovmskb(tmp, vec3);
3498   } else {
3499     pmovmskb(tmp, vec3);
3500   }
3501   bsfl(ch, tmp);
3502   addptr(result, ch);
3503 
3504   bind(FOUND_SEQ_CHAR);
3505   subptr(result, str1);
3506 
3507   bind(DONE_LABEL);
3508 } // stringL_indexof_char
3509 
3510 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3511   switch (eltype) {
3512   case T_BOOLEAN: return sizeof(jboolean);
3513   case T_BYTE:  return sizeof(jbyte);
3514   case T_SHORT: return sizeof(jshort);
3515   case T_CHAR:  return sizeof(jchar);
3516   case T_INT:   return sizeof(jint);
3517   default:
3518     ShouldNotReachHere();
3519     return -1;
3520   }
3521 }
3522 
3523 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3524   switch (eltype) {
3525   // T_BOOLEAN used as surrogate for unsigned byte
3526   case T_BOOLEAN: movzbl(dst, src);   break;
3527   case T_BYTE:    movsbl(dst, src);   break;
3528   case T_SHORT:   movswl(dst, src);   break;
3529   case T_CHAR:    movzwl(dst, src);   break;
3530   case T_INT:     movl(dst, src);     break;
3531   default:
3532     ShouldNotReachHere();
3533   }
3534 }
3535 
3536 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3537   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3538 }
3539 
3540 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3541   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3542 }
3543 
3544 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3545   const int vlen = Assembler::AVX_256bit;
3546   switch (eltype) {
3547   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3548   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3549   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3550   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3551   case T_INT:
3552     // do nothing
3553     break;
3554   default:
3555     ShouldNotReachHere();
3556   }
3557 }
3558 
3559 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3560                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3561                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3562                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3563                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3564                                         BasicType eltype) {
3565   ShortBranchVerifier sbv(this);
3566   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3567   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3568   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3569 
3570   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3571         SHORT_UNROLLED_LOOP_EXIT,
3572         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3573         UNROLLED_VECTOR_LOOP_BEGIN,
3574         END;
3575   switch (eltype) {
3576   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3577   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3578   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3579   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3580   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3581   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3582   }
3583 
3584   // "Rename" the registers for readability of the code below
3585   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3586                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3587                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3588 
3589   const int elsize = arrays_hashcode_elsize(eltype);
3590 
3591   /*
3592     if (cnt1 >= 2) {
3593       if (cnt1 >= 32) {
3594         UNROLLED VECTOR LOOP
3595       }
3596       UNROLLED SCALAR LOOP
3597     }
3598     SINGLE SCALAR
3599    */
3600 
3601   cmpl(cnt1, 32);
3602   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3603 
3604   // cnt1 >= 32 && generate_vectorized_loop
3605   xorl(index, index);
3606 
3607   // vresult = IntVector.zero(I256);
3608   for (int idx = 0; idx < 4; idx++) {
3609     vpxor(vresult[idx], vresult[idx]);
3610   }
3611   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3612   Register bound = tmp2;
3613   Register next = tmp3;
3614   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3615   movl(next, Address(tmp2, 0));
3616   movdl(vnext, next);
3617   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3618 
3619   // index = 0;
3620   // bound = cnt1 & ~(32 - 1);
3621   movl(bound, cnt1);
3622   andl(bound, ~(32 - 1));
3623   // for (; index < bound; index += 32) {
3624   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3625   // result *= next;
3626   imull(result, next);
3627   // Loop fission brings the memory fetches up front; out-of-order execution
3628   // can then hopefully do a better job of prefetching
3629   for (int idx = 0; idx < 4; idx++) {
3630     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3631   }
3632   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3633   for (int idx = 0; idx < 4; idx++) {
3634     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3635     arrays_hashcode_elvcast(vtmp[idx], eltype);
3636     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3637   }
3638   // index += 32;
3639   addl(index, 32);
3640   // index < bound;
3641   cmpl(index, bound);
3642   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3643   // }
3644 
3645   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3646   subl(cnt1, bound);
3647   // release bound
3648 
3649   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3650   for (int idx = 0; idx < 4; idx++) {
3651     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3652     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3653     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3654   }
3655   // result += vresult.reduceLanes(ADD);
3656   for (int idx = 0; idx < 4; idx++) {
3657     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3658   }
3659 
3660   // } else if (cnt1 < 32) {
3661 
3662   bind(SHORT_UNROLLED_BEGIN);
3663   // int i = 1;
3664   movl(index, 1);
3665   cmpl(index, cnt1);
3666   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3667 
3668   // for (; i < cnt1 ; i += 2) {
3669   bind(SHORT_UNROLLED_LOOP_BEGIN);
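  // Two elements per iteration:
  //   result = result * 31 * 31 + a[i-1] * 31 + a[i]
  // (961 == 31 * 31, and x * 31 is computed as (x << 5) - x)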
3670   movl(tmp3, 961);
3671   imull(result, tmp3);
3672   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3673   movl(tmp3, tmp2);
3674   shll(tmp3, 5);
3675   subl(tmp3, tmp2);
3676   addl(result, tmp3);
3677   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3678   addl(result, tmp3);
3679   addl(index, 2);
3680   cmpl(index, cnt1);
3681   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3682 
3683   // }
3684   // if (i >= cnt1) {
3685   bind(SHORT_UNROLLED_LOOP_EXIT);
3686   jccb(Assembler::greater, END);
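  // Single leftover element (cnt1 odd): result = result * 31 + a[i-1]
  // (again using x * 31 == (x << 5) - x)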
3687   movl(tmp2, result);
3688   shll(result, 5);
3689   subl(result, tmp2);
3690   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3691   addl(result, tmp3);
3692   // }
3693   bind(END);
3694 
3695   BLOCK_COMMENT("} // arrays_hashcode");
3696 
3697 } // arrays_hashcode
3698 
3699 // helper function for string_compare
3700 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3701                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3702                                            Address::ScaleFactor scale2, Register index, int ae) {
3703   if (ae == StrIntrinsicNode::LL) {
3704     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3705     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3706   } else if (ae == StrIntrinsicNode::UU) {
3707     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3708     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3709   } else {
3710     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3711     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3712   }
3713 }
3714 
3715 // Compare strings, used for char[] and byte[].
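// Roughly (illustrative Java pseudocode):
//   int lim = Math.min(cnt1, cnt2);
//   for (int i = 0; i < lim; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;
// (the UL path negates the final result before returning)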
3716 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3717                                        Register cnt1, Register cnt2, Register result,
3718                                        XMMRegister vec1, int ae, KRegister mask) {
3719   ShortBranchVerifier sbv(this);
3720   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3721   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3722   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3723   int stride2x2 = 0x40;
3724   Address::ScaleFactor scale = Address::no_scale;
3725   Address::ScaleFactor scale1 = Address::no_scale;
3726   Address::ScaleFactor scale2 = Address::no_scale;
3727 
3728   if (ae != StrIntrinsicNode::LL) {
3729     stride2x2 = 0x20;
3730   }
3731 
3732   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3733     shrl(cnt2, 1);
3734   }
3735   // Compute the minimum of the string lengths and the
3736   // difference of the string lengths (stack).
3737   // Do the conditional move stuff
3738   movl(result, cnt1);
3739   subl(cnt1, cnt2);
3740   push(cnt1);
3741   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3742 
3743   // Is the minimum length zero?
3744   testl(cnt2, cnt2);
3745   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3746   if (ae == StrIntrinsicNode::LL) {
3747     // Load first bytes
3748     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3749     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3750   } else if (ae == StrIntrinsicNode::UU) {
3751     // Load first characters
3752     load_unsigned_short(result, Address(str1, 0));
3753     load_unsigned_short(cnt1, Address(str2, 0));
3754   } else {
3755     load_unsigned_byte(result, Address(str1, 0));
3756     load_unsigned_short(cnt1, Address(str2, 0));
3757   }
3758   subl(result, cnt1);
3759   jcc(Assembler::notZero,  POP_LABEL);
3760 
3761   if (ae == StrIntrinsicNode::UU) {
3762     // Divide length by 2 to get number of chars
3763     shrl(cnt2, 1);
3764   }
3765   cmpl(cnt2, 1);
3766   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3767 
3768   // Check if the strings start at the same location and setup scale and stride
3769   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3770     cmpptr(str1, str2);
3771     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3772     if (ae == StrIntrinsicNode::LL) {
3773       scale = Address::times_1;
3774       stride = 16;
3775     } else {
3776       scale = Address::times_2;
3777       stride = 8;
3778     }
3779   } else {
3780     scale1 = Address::times_1;
3781     scale2 = Address::times_2;
3782     // scale not used
3783     stride = 8;
3784   }
3785 
3786   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3787     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3788     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3789     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3790     Label COMPARE_TAIL_LONG;
3791     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3792 
3793     int pcmpmask = 0x19;
3794     if (ae == StrIntrinsicNode::LL) {
3795       pcmpmask &= ~0x01;
3796     }
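    // pcmpmask 0x19: cmp mode 11000 (string compare with negated result)
    // + 01 (unsigned shorts); the LL case clears bit 0 to use unsigned bytes.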
3797 
3798     // Setup to compare 16-char (32-byte) vectors,
3799     // starting from the first character again because it has an aligned address.
3800     if (ae == StrIntrinsicNode::LL) {
3801       stride2 = 32;
3802     } else {
3803       stride2 = 16;
3804     }
3805     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3806       adr_stride = stride << scale;
3807     } else {
3808       adr_stride1 = 8;  //stride << scale1;
3809       adr_stride2 = 16; //stride << scale2;
3810     }
3811 
3812     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3813     // rax and rdx are used by pcmpestri as elements counters
3814     movl(result, cnt2);
3815     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3816     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3817 
3818     // fast path : compare first 2 8-char vectors.
3819     bind(COMPARE_16_CHARS);
3820     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3821       movdqu(vec1, Address(str1, 0));
3822     } else {
3823       pmovzxbw(vec1, Address(str1, 0));
3824     }
3825     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3826     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3827 
3828     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3829       movdqu(vec1, Address(str1, adr_stride));
3830       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3831     } else {
3832       pmovzxbw(vec1, Address(str1, adr_stride1));
3833       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3834     }
3835     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3836     addl(cnt1, stride);
3837 
3838     // Compare the characters at index in cnt1
3839     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3840     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3841     subl(result, cnt2);
3842     jmp(POP_LABEL);
3843 
3844     // Setup the registers to start vector comparison loop
3845     bind(COMPARE_WIDE_VECTORS);
3846     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3847       lea(str1, Address(str1, result, scale));
3848       lea(str2, Address(str2, result, scale));
3849     } else {
3850       lea(str1, Address(str1, result, scale1));
3851       lea(str2, Address(str2, result, scale2));
3852     }
3853     subl(result, stride2);
3854     subl(cnt2, stride2);
3855     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3856     negptr(result);
3857 
3858     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3859     bind(COMPARE_WIDE_VECTORS_LOOP);
3860 
3861 #ifdef _LP64
3862     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3863       cmpl(cnt2, stride2x2);
3864       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3865       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3866       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract 0x40
3867 
3868       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3869       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3870         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3871         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0s
3872       } else {
3873         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3874         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0s
3875       }
3876       kortestql(mask, mask);
3877       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3878       addptr(result, stride2x2);  // update since we already compared at this addr
3879       subl(cnt2, stride2x2);      // and sub the size too
3880       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3881 
3882       vpxor(vec1, vec1);
3883       jmpb(COMPARE_WIDE_TAIL);
3884     }//if (VM_Version::supports_avx512vlbw())
3885 #endif // _LP64
3886 
3887 
3888     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3889     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3890       vmovdqu(vec1, Address(str1, result, scale));
3891       vpxor(vec1, Address(str2, result, scale));
3892     } else {
3893       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3894       vpxor(vec1, Address(str2, result, scale2));
3895     }
3896     vptest(vec1, vec1);
3897     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3898     addptr(result, stride2);
3899     subl(cnt2, stride2);
3900     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3901     // clean upper bits of YMM registers
3902     vpxor(vec1, vec1);
3903 
3904     // compare wide vectors tail
3905     bind(COMPARE_WIDE_TAIL);
3906     testptr(result, result);
3907     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3908 
3909     movl(result, stride2);
3910     movl(cnt2, result);
3911     negptr(result);
3912     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3913 
3914     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3915     bind(VECTOR_NOT_EQUAL);
3916     // clean upper bits of YMM registers
3917     vpxor(vec1, vec1);
3918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919       lea(str1, Address(str1, result, scale));
3920       lea(str2, Address(str2, result, scale));
3921     } else {
3922       lea(str1, Address(str1, result, scale1));
3923       lea(str2, Address(str2, result, scale2));
3924     }
3925     jmp(COMPARE_16_CHARS);
3926 
3927     // Compare tail chars, length between 1 and 15 chars
3928     bind(COMPARE_TAIL_LONG);
3929     movl(cnt2, result);
3930     cmpl(cnt2, stride);
3931     jcc(Assembler::less, COMPARE_SMALL_STR);
3932 
3933     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3934       movdqu(vec1, Address(str1, 0));
3935     } else {
3936       pmovzxbw(vec1, Address(str1, 0));
3937     }
3938     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3939     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3940     subptr(cnt2, stride);
3941     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3942     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3943       lea(str1, Address(str1, result, scale));
3944       lea(str2, Address(str2, result, scale));
3945     } else {
3946       lea(str1, Address(str1, result, scale1));
3947       lea(str2, Address(str2, result, scale2));
3948     }
3949     negptr(cnt2);
3950     jmpb(WHILE_HEAD_LABEL);
3951 
3952     bind(COMPARE_SMALL_STR);
3953   } else if (UseSSE42Intrinsics) {
3954     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3955     int pcmpmask = 0x19;
3956     // Setup to compare 8-char (16-byte) vectors,
3957     // starting from the first character again because it has an aligned address.
3958     movl(result, cnt2);
3959     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3960     if (ae == StrIntrinsicNode::LL) {
3961       pcmpmask &= ~0x01;
3962     }
3963     jcc(Assembler::zero, COMPARE_TAIL);
3964     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3965       lea(str1, Address(str1, result, scale));
3966       lea(str2, Address(str2, result, scale));
3967     } else {
3968       lea(str1, Address(str1, result, scale1));
3969       lea(str2, Address(str2, result, scale2));
3970     }
3971     negptr(result);
3972 
3973     // pcmpestri
3974     //   inputs:
3975     //     vec1- substring
3976     //     rax - negative string length (elements count)
3977     //     mem - scanned string
3978     //     rdx - string length (elements count)
3979     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3980     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3981     //   outputs:
3982     //     rcx - first mismatched element index
3983     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3984 
3985     bind(COMPARE_WIDE_VECTORS);
3986     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3987       movdqu(vec1, Address(str1, result, scale));
3988       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3989     } else {
3990       pmovzxbw(vec1, Address(str1, result, scale1));
3991       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3992     }
3993     // After pcmpestri cnt1(rcx) contains mismatched element index
3994 
3995     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3996     addptr(result, stride);
3997     subptr(cnt2, stride);
3998     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3999 
4000     // compare wide vectors tail
4001     testptr(result, result);
4002     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4003 
4004     movl(cnt2, stride);
4005     movl(result, stride);
4006     negptr(result);
4007     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4008       movdqu(vec1, Address(str1, result, scale));
4009       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4010     } else {
4011       pmovzxbw(vec1, Address(str1, result, scale1));
4012       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4013     }
4014     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4015 
4016     // Mismatched characters in the vectors
4017     bind(VECTOR_NOT_EQUAL);
4018     addptr(cnt1, result);
4019     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4020     subl(result, cnt2);
4021     jmpb(POP_LABEL);
4022 
4023     bind(COMPARE_TAIL); // limit is zero
4024     movl(cnt2, result);
4025     // Fallthru to tail compare
4026   }
4027   // Shift str2 and str1 to the end of the arrays, negate min
4028   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4029     lea(str1, Address(str1, cnt2, scale));
4030     lea(str2, Address(str2, cnt2, scale));
4031   } else {
4032     lea(str1, Address(str1, cnt2, scale1));
4033     lea(str2, Address(str2, cnt2, scale2));
4034   }
4035   decrementl(cnt2);  // first character was compared already
4036   negptr(cnt2);
4037 
4038   // Compare the rest of the elements
4039   bind(WHILE_HEAD_LABEL);
4040   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4041   subl(result, cnt1);
4042   jccb(Assembler::notZero, POP_LABEL);
4043   increment(cnt2);
4044   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4045 
4046   // Strings are equal up to min length.  Return the length difference.
4047   bind(LENGTH_DIFF_LABEL);
4048   pop(result);
4049   if (ae == StrIntrinsicNode::UU) {
4050     // Divide diff by 2 to get number of chars
4051     sarl(result, 1);
4052   }
4053   jmpb(DONE_LABEL);
4054 
4055 #ifdef _LP64
4056   if (VM_Version::supports_avx512vlbw()) {
4057 
4058     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4059 
4060     kmovql(cnt1, mask);
4061     notq(cnt1);
4062     bsfq(cnt2, cnt1);
4063     if (ae != StrIntrinsicNode::LL) {
4064       // Divide diff by 2 to get number of chars
4065       sarl(cnt2, 1);
4066     }
4067     addq(result, cnt2);
4068     if (ae == StrIntrinsicNode::LL) {
4069       load_unsigned_byte(cnt1, Address(str2, result));
4070       load_unsigned_byte(result, Address(str1, result));
4071     } else if (ae == StrIntrinsicNode::UU) {
4072       load_unsigned_short(cnt1, Address(str2, result, scale));
4073       load_unsigned_short(result, Address(str1, result, scale));
4074     } else {
4075       load_unsigned_short(cnt1, Address(str2, result, scale2));
4076       load_unsigned_byte(result, Address(str1, result, scale1));
4077     }
4078     subl(result, cnt1);
4079     jmpb(POP_LABEL);
4080   }//if (VM_Version::supports_avx512vlbw())
4081 #endif // _LP64
4082 
4083   // Discard the stored length difference
4084   bind(POP_LABEL);
4085   pop(cnt1);
4086 
4087   // That's it
4088   bind(DONE_LABEL);
4089   if(ae == StrIntrinsicNode::UL) {
4090     negl(result);
4091   }
4092 
4093 }
4094 
4095 // Search for Non-ASCII character (Negative byte value) in a byte array,
4096 // return the index of the first such character, otherwise the length
4097 // of the array segment searched.
4098 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4099 //   @IntrinsicCandidate
4100 //   public static int countPositives(byte[] ba, int off, int len) {
4101 //     for (int i = off; i < off + len; i++) {
4102 //       if (ba[i] < 0) {
4103 //         return i - off;
4104 //       }
4105 //     }
4106 //     return len;
4107 //   }
4108 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4109   Register result, Register tmp1,
4110   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4111   // rsi: byte array
4112   // rcx: len
4113   // rax: result
4114   ShortBranchVerifier sbv(this);
4115   assert_different_registers(ary1, len, result, tmp1);
4116   assert_different_registers(vec1, vec2);
4117   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4118 
4119   movl(result, len); // copy
4120   // len == 0
4121   testl(len, len);
4122   jcc(Assembler::zero, DONE);
4123 
4124   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4125     VM_Version::supports_avx512vlbw() &&
4126     VM_Version::supports_bmi2()) {
4127 
4128     Label test_64_loop, test_tail, BREAK_LOOP;
4129     movl(tmp1, len);
4130     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4131 
    andl(tmp1, 0x0000003f); // tail count (in bytes) 0x3F
    andl(len,  0xffffffc0); // vector count (in bytes)
4134     jccb(Assembler::zero, test_tail);
4135 
4136     lea(ary1, Address(ary1, len, Address::times_1));
4137     negptr(len);
4138 
4139     bind(test_64_loop);
    // Check whether any of these 64 byte-sized elements is negative
4141     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4142     kortestql(mask1, mask1);
4143     jcc(Assembler::notZero, BREAK_LOOP);
4144 
4145     addptr(len, 64);
4146     jccb(Assembler::notZero, test_64_loop);
4147 
4148     bind(test_tail);
4149     // bail out when there is nothing to be done
4150     testl(tmp1, -1);
4151     jcc(Assembler::zero, DONE);
4152 
4153 
    // check the tail for absence of negatives
4155     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4156 #ifdef _LP64
4157     {
4158       Register tmp3_aliased = len;
4159       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4160       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4161       notq(tmp3_aliased);
4162       kmovql(mask2, tmp3_aliased);
4163     }
4164 #else
4165     Label k_init;
4166     jmp(k_init);
4167 
    // On 32-bit we cannot load 64 bits from a general purpose register, so the
    // data required to compose the mask of 64 1's is placed in the instruction stream.
    // We emit a 64-byte-wide series of the elements 0..63, which is later used as a
    // compare target against the tail count contained in the tmp1 register.
    // The result is a k register holding tmp1 consecutive 1's counting from the
    // least significant bit.
4174     address tmp = pc();
4175     emit_int64(0x0706050403020100);
4176     emit_int64(0x0F0E0D0C0B0A0908);
4177     emit_int64(0x1716151413121110);
4178     emit_int64(0x1F1E1D1C1B1A1918);
4179     emit_int64(0x2726252423222120);
4180     emit_int64(0x2F2E2D2C2B2A2928);
4181     emit_int64(0x3736353433323130);
4182     emit_int64(0x3F3E3D3C3B3A3938);
4183 
4184     bind(k_init);
4185     lea(len, InternalAddress(tmp));
4186     // create mask to test for negative byte inside a vector
4187     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4188     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4189 
4190 #endif
4191     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4192     ktestq(mask1, mask2);
4193     jcc(Assembler::zero, DONE);
4194 
    // do a full check for negative bytes in the tail
4196     movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4197                      // ary1 already pointing to the right place
4198     jmpb(TAIL_START);
4199 
4200     bind(BREAK_LOOP);
4201     // At least one byte in the last 64 byte block was negative.
4202     // Set up to look at the last 64 bytes as if they were a tail
4203     lea(ary1, Address(ary1, len, Address::times_1));
4204     addptr(result, len);
4205     // Ignore the very last byte: if all others are positive,
4206     // it must be negative, so we can skip right to the 2+1 byte
4207     // end comparison at this point
4208     orl(result, 63);
4209     movl(len, 63);
4210     // Fallthru to tail compare
4211   } else {
4212 
4213     if (UseAVX >= 2 && UseSSE >= 2) {
4214       // With AVX2, use 32-byte vector compare
4215       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4216 
4217       // Compare 32-byte vectors
4218       testl(len, 0xffffffe0);   // vector count (in bytes)
4219       jccb(Assembler::zero, TAIL_START);
4220 
4221       andl(len, 0xffffffe0);
4222       lea(ary1, Address(ary1, len, Address::times_1));
4223       negptr(len);
4224 
4225       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
4226       movdl(vec2, tmp1);
4227       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4228 
4229       bind(COMPARE_WIDE_VECTORS);
4230       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4231       vptest(vec1, vec2);
4232       jccb(Assembler::notZero, BREAK_LOOP);
4233       addptr(len, 32);
4234       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4235 
4236       testl(result, 0x0000001f);   // any bytes remaining?
4237       jcc(Assembler::zero, DONE);
4238 
4239       // Quick test using the already prepared vector mask
4240       movl(len, result);
4241       andl(len, 0x0000001f);
4242       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4243       vptest(vec1, vec2);
4244       jcc(Assembler::zero, DONE);
4245       // There are zeros, jump to the tail to determine exactly where
4246       jmpb(TAIL_START);
4247 
4248       bind(BREAK_LOOP);
4249       // At least one byte in the last 32-byte vector is negative.
4250       // Set up to look at the last 32 bytes as if they were a tail
4251       lea(ary1, Address(ary1, len, Address::times_1));
4252       addptr(result, len);
4253       // Ignore the very last byte: if all others are positive,
4254       // it must be negative, so we can skip right to the 2+1 byte
4255       // end comparison at this point
4256       orl(result, 31);
4257       movl(len, 31);
4258       // Fallthru to tail compare
4259     } else if (UseSSE42Intrinsics) {
4260       // With SSE4.2, use double quad vector compare
4261       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4262 
4263       // Compare 16-byte vectors
4264       testl(len, 0xfffffff0);   // vector count (in bytes)
4265       jcc(Assembler::zero, TAIL_START);
4266 
4267       andl(len, 0xfffffff0);
4268       lea(ary1, Address(ary1, len, Address::times_1));
4269       negptr(len);
4270 
4271       movl(tmp1, 0x80808080);
4272       movdl(vec2, tmp1);
4273       pshufd(vec2, vec2, 0);
4274 
4275       bind(COMPARE_WIDE_VECTORS);
4276       movdqu(vec1, Address(ary1, len, Address::times_1));
4277       ptest(vec1, vec2);
4278       jccb(Assembler::notZero, BREAK_LOOP);
4279       addptr(len, 16);
4280       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4281 
4282       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4283       jcc(Assembler::zero, DONE);
4284 
4285       // Quick test using the already prepared vector mask
4286       movl(len, result);
4287       andl(len, 0x0000000f);   // tail count (in bytes)
4288       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4289       ptest(vec1, vec2);
4290       jcc(Assembler::zero, DONE);
4291       jmpb(TAIL_START);
4292 
4293       bind(BREAK_LOOP);
4294       // At least one byte in the last 16-byte vector is negative.
4295       // Set up and look at the last 16 bytes as if they were a tail
4296       lea(ary1, Address(ary1, len, Address::times_1));
4297       addptr(result, len);
4298       // Ignore the very last byte: if all others are positive,
4299       // it must be negative, so we can skip right to the 2+1 byte
4300       // end comparison at this point
4301       orl(result, 15);
4302       movl(len, 15);
4303       // Fallthru to tail compare
4304     }
4305   }
4306 
4307   bind(TAIL_START);
4308   // Compare 4-byte vectors
4309   andl(len, 0xfffffffc); // vector count (in bytes)
4310   jccb(Assembler::zero, COMPARE_CHAR);
4311 
4312   lea(ary1, Address(ary1, len, Address::times_1));
4313   negptr(len);
4314 
4315   bind(COMPARE_VECTORS);
4316   movl(tmp1, Address(ary1, len, Address::times_1));
4317   andl(tmp1, 0x80808080);
4318   jccb(Assembler::notZero, TAIL_ADJUST);
4319   addptr(len, 4);
4320   jccb(Assembler::notZero, COMPARE_VECTORS);
4321 
4322   // Compare trailing char (final 2-3 bytes), if any
4323   bind(COMPARE_CHAR);
4324 
4325   testl(result, 0x2);   // tail  char
4326   jccb(Assembler::zero, COMPARE_BYTE);
4327   load_unsigned_short(tmp1, Address(ary1, 0));
4328   andl(tmp1, 0x00008080);
4329   jccb(Assembler::notZero, CHAR_ADJUST);
4330   lea(ary1, Address(ary1, 2));
4331 
4332   bind(COMPARE_BYTE);
4333   testl(result, 0x1);   // tail  byte
4334   jccb(Assembler::zero, DONE);
4335   load_unsigned_byte(tmp1, Address(ary1, 0));
4336   testl(tmp1, 0x00000080);
4337   jccb(Assembler::zero, DONE);
4338   subptr(result, 1);
4339   jmpb(DONE);
4340 
4341   bind(TAIL_ADJUST);
4342   // there are negative bits in the last 4 byte block.
4343   // Adjust result and check the next three bytes
4344   addptr(result, len);
4345   orl(result, 3);
4346   lea(ary1, Address(ary1, len, Address::times_1));
4347   jmpb(COMPARE_CHAR);
4348 
4349   bind(CHAR_ADJUST);
4350   // We are looking at a char + optional byte tail, and found that one
4351   // of the bytes in the char is negative. Adjust the result, check the
4352   // first byte and readjust if needed.
4353   andl(result, 0xfffffffc);
4354   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4355   jccb(Assembler::notZero, DONE);
4356   addptr(result, 1);
4357 
4358   // That's it
4359   bind(DONE);
4360   if (UseAVX >= 2 && UseSSE >= 2) {
4361     // clean upper bits of YMM registers
4362     vpxor(vec1, vec1);
4363     vpxor(vec2, vec2);
4364   }
4365 }
4366 
4367 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
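// A rough Java-level sketch of the equality semantics this intrinsic implements
// (illustrative only, not the actual library source; the null and length checks
// correspond to the is_array_equ path, the substring path skips them):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }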
4368 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4369                                       Register limit, Register result, Register chr,
4370                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4371   ShortBranchVerifier sbv(this);
4372   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4373 
4374   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4375   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4376 
4377   if (is_array_equ) {
4378     // Check the input args
4379     cmpoop(ary1, ary2);
4380     jcc(Assembler::equal, TRUE_LABEL);
4381 
4382     // Need additional checks for arrays_equals.
4383     testptr(ary1, ary1);
4384     jcc(Assembler::zero, FALSE_LABEL);
4385     testptr(ary2, ary2);
4386     jcc(Assembler::zero, FALSE_LABEL);
4387 
4388     // Check the lengths
4389     movl(limit, Address(ary1, length_offset));
4390     cmpl(limit, Address(ary2, length_offset));
4391     jcc(Assembler::notEqual, FALSE_LABEL);
4392   }
4393 
4394   // count == 0
4395   testl(limit, limit);
4396   jcc(Assembler::zero, TRUE_LABEL);
4397 
4398   if (is_array_equ) {
4399     // Load array address
4400     lea(ary1, Address(ary1, base_offset));
4401     lea(ary2, Address(ary2, base_offset));
4402   }
4403 
4404   if (is_array_equ && is_char) {
4405     // arrays_equals when used for char[].
    shll(limit, 1);      // convert char count to byte count (still != 0)
4407   }
4408   movl(result, limit); // copy
4409 
4410   if (UseAVX >= 2) {
4411     // With AVX2, use 32-byte vector compare
4412     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4413 
4414     // Compare 32-byte vectors
4415     andl(result, 0x0000001f);  //   tail count (in bytes)
4416     andl(limit, 0xffffffe0);   // vector count (in bytes)
4417     jcc(Assembler::zero, COMPARE_TAIL);
4418 
4419     lea(ary1, Address(ary1, limit, Address::times_1));
4420     lea(ary2, Address(ary2, limit, Address::times_1));
4421     negptr(limit);
4422 
4423 #ifdef _LP64
4424     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4425       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4426 
4427       cmpl(limit, -64);
4428       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4429 
4430       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4431 
4432       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4433       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4434       kortestql(mask, mask);
4435       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4436       addptr(limit, 64);  // update since we already compared at this addr
4437       cmpl(limit, -64);
4438       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4439 
      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit, which are
      // not farther than 64 bytes from the ends of the arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, because we just came from this area
4450       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4451       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4452       kortestql(mask, mask);
4453       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4454 
4455       jmp(TRUE_LABEL);
4456 
4457       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4458 
4459     }//if (VM_Version::supports_avx512vlbw())
4460 #endif //_LP64
4461     bind(COMPARE_WIDE_VECTORS);
4462     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4463     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4464     vpxor(vec1, vec2);
4465 
4466     vptest(vec1, vec1);
4467     jcc(Assembler::notZero, FALSE_LABEL);
4468     addptr(limit, 32);
4469     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4470 
4471     testl(result, result);
4472     jcc(Assembler::zero, TRUE_LABEL);
4473 
4474     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4475     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4476     vpxor(vec1, vec2);
4477 
4478     vptest(vec1, vec1);
4479     jccb(Assembler::notZero, FALSE_LABEL);
4480     jmpb(TRUE_LABEL);
4481 
4482     bind(COMPARE_TAIL); // limit is zero
4483     movl(limit, result);
4484     // Fallthru to tail compare
4485   } else if (UseSSE42Intrinsics) {
4486     // With SSE4.2, use double quad vector compare
4487     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4488 
4489     // Compare 16-byte vectors
4490     andl(result, 0x0000000f);  //   tail count (in bytes)
4491     andl(limit, 0xfffffff0);   // vector count (in bytes)
4492     jcc(Assembler::zero, COMPARE_TAIL);
4493 
4494     lea(ary1, Address(ary1, limit, Address::times_1));
4495     lea(ary2, Address(ary2, limit, Address::times_1));
4496     negptr(limit);
4497 
4498     bind(COMPARE_WIDE_VECTORS);
4499     movdqu(vec1, Address(ary1, limit, Address::times_1));
4500     movdqu(vec2, Address(ary2, limit, Address::times_1));
4501     pxor(vec1, vec2);
4502 
4503     ptest(vec1, vec1);
4504     jcc(Assembler::notZero, FALSE_LABEL);
4505     addptr(limit, 16);
4506     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4507 
4508     testl(result, result);
4509     jcc(Assembler::zero, TRUE_LABEL);
4510 
4511     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4512     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4513     pxor(vec1, vec2);
4514 
4515     ptest(vec1, vec1);
4516     jccb(Assembler::notZero, FALSE_LABEL);
4517     jmpb(TRUE_LABEL);
4518 
4519     bind(COMPARE_TAIL); // limit is zero
4520     movl(limit, result);
4521     // Fallthru to tail compare
4522   }
4523 
4524   // Compare 4-byte vectors
4525   andl(limit, 0xfffffffc); // vector count (in bytes)
4526   jccb(Assembler::zero, COMPARE_CHAR);
4527 
4528   lea(ary1, Address(ary1, limit, Address::times_1));
4529   lea(ary2, Address(ary2, limit, Address::times_1));
4530   negptr(limit);
4531 
4532   bind(COMPARE_VECTORS);
4533   movl(chr, Address(ary1, limit, Address::times_1));
4534   cmpl(chr, Address(ary2, limit, Address::times_1));
4535   jccb(Assembler::notEqual, FALSE_LABEL);
4536   addptr(limit, 4);
4537   jcc(Assembler::notZero, COMPARE_VECTORS);
4538 
4539   // Compare trailing char (final 2 bytes), if any
4540   bind(COMPARE_CHAR);
4541   testl(result, 0x2);   // tail  char
4542   jccb(Assembler::zero, COMPARE_BYTE);
4543   load_unsigned_short(chr, Address(ary1, 0));
4544   load_unsigned_short(limit, Address(ary2, 0));
4545   cmpl(chr, limit);
4546   jccb(Assembler::notEqual, FALSE_LABEL);
4547 
4548   if (is_array_equ && is_char) {
4549     bind(COMPARE_BYTE);
4550   } else {
4551     lea(ary1, Address(ary1, 2));
4552     lea(ary2, Address(ary2, 2));
4553 
4554     bind(COMPARE_BYTE);
4555     testl(result, 0x1);   // tail  byte
4556     jccb(Assembler::zero, TRUE_LABEL);
4557     load_unsigned_byte(chr, Address(ary1, 0));
4558     load_unsigned_byte(limit, Address(ary2, 0));
4559     cmpl(chr, limit);
4560     jccb(Assembler::notEqual, FALSE_LABEL);
4561   }
4562   bind(TRUE_LABEL);
4563   movl(result, 1);   // return true
4564   jmpb(DONE);
4565 
4566   bind(FALSE_LABEL);
4567   xorl(result, result); // return false
4568 
4569   // That's it
4570   bind(DONE);
4571   if (UseAVX >= 2) {
4572     // clean upper bits of YMM registers
4573     vpxor(vec1, vec1);
4574     vpxor(vec2, vec2);
4575   }
4576 }
4577 
4578 #ifdef _LP64
4579 
4580 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4581 #define __ masm.
4582   Register dst = stub.data<0>();
4583   XMMRegister src = stub.data<1>();
4584   address target = stub.data<2>();
4585   __ bind(stub.entry());
4586   __ subptr(rsp, 8);
4587   __ movdbl(Address(rsp), src);
4588   __ call(RuntimeAddress(target));
4589   __ pop(dst);
4590   __ jmp(stub.continuation());
4591 #undef __
4592 }
4593 
4594 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4595   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4596   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4597 
4598   address slowpath_target;
4599   if (dst_bt == T_INT) {
4600     if (src_bt == T_FLOAT) {
4601       cvttss2sil(dst, src);
4602       cmpl(dst, 0x80000000);
4603       slowpath_target = StubRoutines::x86::f2i_fixup();
4604     } else {
4605       cvttsd2sil(dst, src);
4606       cmpl(dst, 0x80000000);
4607       slowpath_target = StubRoutines::x86::d2i_fixup();
4608     }
4609   } else {
4610     if (src_bt == T_FLOAT) {
4611       cvttss2siq(dst, src);
4612       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4613       slowpath_target = StubRoutines::x86::f2l_fixup();
4614     } else {
4615       cvttsd2siq(dst, src);
4616       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4617       slowpath_target = StubRoutines::x86::d2l_fixup();
4618     }
4619   }
4620 
4621   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4622   jcc(Assembler::equal, stub->entry());
4623   bind(stub->continuation());
4624 }
4625 
4626 #endif // _LP64
4627 
4628 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4629                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4630   switch(ideal_opc) {
4631     case Op_LShiftVS:
4632       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4633     case Op_LShiftVI:
4634       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4635     case Op_LShiftVL:
4636       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4637     case Op_RShiftVS:
4638       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4639     case Op_RShiftVI:
4640       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4641     case Op_RShiftVL:
4642       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4643     case Op_URShiftVS:
4644       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4645     case Op_URShiftVI:
4646       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4647     case Op_URShiftVL:
4648       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4649     case Op_RotateRightV:
4650       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4651     case Op_RotateLeftV:
4652       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4653     default:
4654       fatal("Unsupported masked operation"); break;
4655   }
4656 }
4657 
4658 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4659                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4660                                     bool is_varshift) {
4661   switch (ideal_opc) {
4662     case Op_AddVB:
4663       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4664     case Op_AddVS:
4665       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4666     case Op_AddVI:
4667       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4668     case Op_AddVL:
4669       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4670     case Op_AddVF:
4671       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4672     case Op_AddVD:
4673       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4674     case Op_SubVB:
4675       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4676     case Op_SubVS:
4677       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4678     case Op_SubVI:
4679       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4680     case Op_SubVL:
4681       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4682     case Op_SubVF:
4683       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4684     case Op_SubVD:
4685       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4686     case Op_MulVS:
4687       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4688     case Op_MulVI:
4689       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4690     case Op_MulVL:
4691       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4692     case Op_MulVF:
4693       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_MulVD:
4695       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_DivVF:
4697       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4698     case Op_DivVD:
4699       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_SqrtVF:
4701       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_SqrtVD:
4703       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4704     case Op_AbsVB:
4705       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4706     case Op_AbsVS:
4707       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4708     case Op_AbsVI:
4709       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4710     case Op_AbsVL:
4711       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4712     case Op_FmaVF:
4713       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_FmaVD:
4715       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4716     case Op_VectorRearrange:
4717       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4718     case Op_LShiftVS:
4719       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4720     case Op_LShiftVI:
4721       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4722     case Op_LShiftVL:
4723       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4724     case Op_RShiftVS:
4725       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4726     case Op_RShiftVI:
4727       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4728     case Op_RShiftVL:
4729       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4730     case Op_URShiftVS:
4731       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4732     case Op_URShiftVI:
4733       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4734     case Op_URShiftVL:
4735       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4736     case Op_RotateLeftV:
4737       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4738     case Op_RotateRightV:
4739       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_MaxV:
4741       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4742     case Op_MinV:
4743       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4744     case Op_XorV:
4745       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4746     case Op_OrV:
4747       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4748     case Op_AndV:
4749       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4750     default:
4751       fatal("Unsupported masked operation"); break;
4752   }
4753 }
4754 
4755 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4756                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4757   switch (ideal_opc) {
4758     case Op_AddVB:
4759       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4760     case Op_AddVS:
4761       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4762     case Op_AddVI:
4763       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_AddVL:
4765       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_AddVF:
4767       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_AddVD:
4769       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_SubVB:
4771       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_SubVS:
4773       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_SubVI:
4775       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4776     case Op_SubVL:
4777       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4778     case Op_SubVF:
4779       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4780     case Op_SubVD:
4781       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4782     case Op_MulVS:
4783       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4784     case Op_MulVI:
4785       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4786     case Op_MulVL:
4787       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_MulVF:
4789       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_MulVD:
4791       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4792     case Op_DivVF:
4793       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4794     case Op_DivVD:
4795       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4796     case Op_FmaVF:
4797       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4798     case Op_FmaVD:
4799       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4800     case Op_MaxV:
4801       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4802     case Op_MinV:
4803       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4804     case Op_XorV:
4805       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4806     case Op_OrV:
4807       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4808     case Op_AndV:
4809       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4810     default:
4811       fatal("Unsupported masked operation"); break;
4812   }
4813 }
4814 
4815 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4816                                   KRegister src1, KRegister src2) {
4817   BasicType etype = T_ILLEGAL;
4818   switch(mask_len) {
4819     case 2:
4820     case 4:
4821     case 8:  etype = T_BYTE; break;
4822     case 16: etype = T_SHORT; break;
4823     case 32: etype = T_INT; break;
4824     case 64: etype = T_LONG; break;
4825     default: fatal("Unsupported type"); break;
4826   }
4827   assert(etype != T_ILLEGAL, "");
4828   switch(ideal_opc) {
4829     case Op_AndVMask:
4830       kand(etype, dst, src1, src2); break;
4831     case Op_OrVMask:
4832       kor(etype, dst, src1, src2); break;
4833     case Op_XorVMask:
4834       kxor(etype, dst, src1, src2); break;
4835     default:
4836       fatal("Unsupported masked operation"); break;
4837   }
4838 }
4839 
4840 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4842  * If src is NaN, the result is 0.
4843  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4844  * the result is equal to the value of Integer.MIN_VALUE.
4845  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4846  * the result is equal to the value of Integer.MAX_VALUE.
4847  */
4848 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4849                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4850                                                                    Register rscratch, AddressLiteral float_sign_flip,
4851                                                                    int vec_enc) {
4852   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4853   Label done;
4854   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4855   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4856   vptest(xtmp2, xtmp2, vec_enc);
4857   jccb(Assembler::equal, done);
4858 
4859   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4860   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4861 
4862   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4863   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4864   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4865 
  // Recompute the mask for the remaining special values.
4867   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4868   // Extract SRC values corresponding to TRUE mask lanes.
4869   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSB of MASK lanes corresponding to +ve special
  // values is set.
4872   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4873 
4874   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4875   bind(done);
4876 }
4877 
4878 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4879                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4880                                                                     Register rscratch, AddressLiteral float_sign_flip,
4881                                                                     int vec_enc) {
4882   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4883   Label done;
4884   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4885   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4886   kortestwl(ktmp1, ktmp1);
4887   jccb(Assembler::equal, done);
4888 
4889   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4890   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4891   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4892 
4893   kxorwl(ktmp1, ktmp1, ktmp2);
4894   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4895   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4896   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4897   bind(done);
4898 }
4899 
4900 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4901                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4902                                                                      Register rscratch, AddressLiteral double_sign_flip,
4903                                                                      int vec_enc) {
4904   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4905 
4906   Label done;
4907   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4908   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4909   kortestwl(ktmp1, ktmp1);
4910   jccb(Assembler::equal, done);
4911 
4912   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4913   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4914   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4915 
4916   kxorwl(ktmp1, ktmp1, ktmp2);
4917   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4918   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4919   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4920   bind(done);
4921 }
4922 
4923 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4924                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4925                                                                      Register rscratch, AddressLiteral float_sign_flip,
4926                                                                      int vec_enc) {
4927   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4928   Label done;
4929   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4930   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4931   kortestwl(ktmp1, ktmp1);
4932   jccb(Assembler::equal, done);
4933 
4934   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4935   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4936   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4937 
4938   kxorwl(ktmp1, ktmp1, ktmp2);
4939   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4940   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4941   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4942   bind(done);
4943 }
4944 
4945 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4947  * If src is NaN, the result is 0.
4948  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4949  * the result is equal to the value of Long.MIN_VALUE.
4950  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4951  * the result is equal to the value of Long.MAX_VALUE.
4952  */
4953 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4954                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4955                                                                       Register rscratch, AddressLiteral double_sign_flip,
4956                                                                       int vec_enc) {
4957   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4958 
4959   Label done;
4960   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4961   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4962   kortestwl(ktmp1, ktmp1);
4963   jccb(Assembler::equal, done);
4964 
4965   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4966   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4967   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4968 
4969   kxorwl(ktmp1, ktmp1, ktmp2);
4970   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4971   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4972   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4973   bind(done);
4974 }
4975 
4976 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4977                                                              XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
4985 }
4986 
4987 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4988                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4989                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4990   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4991 
4992   Label done;
4993   // Compare the destination lanes with float_sign_flip
4994   // value to get mask for all special values.
4995   movdqu(xtmp1, float_sign_flip, rscratch);
4996   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4997   ptest(xtmp2, xtmp2);
4998   jccb(Assembler::equal, done);
4999 
5000   // Flip float_sign_flip to get max integer value.
5001   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5002   pxor(xtmp1, xtmp4);
5003 
  // Set destination lanes corresponding to unordered source lanes to zero.
5005   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5006   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5007 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5009   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5010   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5011 
  // Recompute the mask for the remaining special values.
5013   pxor(xtmp2, xtmp3);
5014   // Extract mask corresponding to non-negative source lanes.
5015   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5016 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5018   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5019   pand(xtmp3, xtmp2);
5020 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5023   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5024   bind(done);
5025 }
5026 
5027 
5028 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5029                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5030   switch(to_elem_bt) {
5031     case T_SHORT:
5032       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5033       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5034       vpackusdw(dst, dst, zero, vec_enc);
5035       if (vec_enc == Assembler::AVX_256bit) {
5036         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5037       }
5038       break;
5039     case  T_BYTE:
5040       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5041       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5042       vpackusdw(dst, dst, zero, vec_enc);
5043       if (vec_enc == Assembler::AVX_256bit) {
5044         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5045       }
5046       vpackuswb(dst, dst, zero, vec_enc);
5047       break;
5048     default: assert(false, "%s", type2name(to_elem_bt));
5049   }
5050 }
5051 
5052 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000,
 *    which signifies that the source value could be any of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5060  */
5061 
5062 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5063                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5064                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5065   int to_elem_sz = type2aelembytes(to_elem_bt);
5066   assert(to_elem_sz <= 4, "");
5067   vcvttps2dq(dst, src, vec_enc);
5068   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5069   if (to_elem_sz < 4) {
5070     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5071     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5072   }
5073 }
5074 
5075 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5076                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5077                                             Register rscratch, int vec_enc) {
5078   int to_elem_sz = type2aelembytes(to_elem_bt);
5079   assert(to_elem_sz <= 4, "");
5080   vcvttps2dq(dst, src, vec_enc);
5081   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5082   switch(to_elem_bt) {
5083     case T_INT:
5084       break;
5085     case T_SHORT:
5086       evpmovdw(dst, dst, vec_enc);
5087       break;
5088     case T_BYTE:
5089       evpmovdb(dst, dst, vec_enc);
5090       break;
5091     default: assert(false, "%s", type2name(to_elem_bt));
5092   }
5093 }
5094 
5095 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5096                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5097                                             Register rscratch, int vec_enc) {
5098   evcvttps2qq(dst, src, vec_enc);
5099   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5100 }
5101 
5102 // Handling for downcasting from double to integer or sub-word types on AVX2.
5103 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5104                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5105                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5106   int to_elem_sz = type2aelembytes(to_elem_bt);
5107   assert(to_elem_sz < 8, "");
5108   vcvttpd2dq(dst, src, vec_enc);
5109   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5110                                               float_sign_flip, vec_enc);
5111   if (to_elem_sz < 4) {
5112     // xtmp4 holds all zero lanes.
5113     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5114   }
5115 }
5116 
5117 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5118                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5119                                             KRegister ktmp2, AddressLiteral sign_flip,
5120                                             Register rscratch, int vec_enc) {
5121   if (VM_Version::supports_avx512dq()) {
5122     evcvttpd2qq(dst, src, vec_enc);
5123     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5124     switch(to_elem_bt) {
5125       case T_LONG:
5126         break;
5127       case T_INT:
5128         evpmovsqd(dst, dst, vec_enc);
5129         break;
5130       case T_SHORT:
5131         evpmovsqd(dst, dst, vec_enc);
5132         evpmovdw(dst, dst, vec_enc);
5133         break;
5134       case T_BYTE:
5135         evpmovsqd(dst, dst, vec_enc);
5136         evpmovdb(dst, dst, vec_enc);
5137         break;
5138       default: assert(false, "%s", type2name(to_elem_bt));
5139     }
5140   } else {
5141     assert(type2aelembytes(to_elem_bt) <= 4, "");
5142     vcvttpd2dq(dst, src, vec_enc);
5143     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5144     switch(to_elem_bt) {
5145       case T_INT:
5146         break;
5147       case T_SHORT:
5148         evpmovdw(dst, dst, vec_enc);
5149         break;
5150       case T_BYTE:
5151         evpmovdb(dst, dst, vec_enc);
5152         break;
5153       default: assert(false, "%s", type2name(to_elem_bt));
5154     }
5155   }
5156 }
5157 
5158 #ifdef _LP64
5159 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5160                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5161                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
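  // For example (assuming the usual Math.round contract): round(2.5) computes
  // floor(3.0) = 3 and round(-2.5) computes floor(-2.0) = -2.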
5164   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5165 
5166   mov64(tmp, julong_cast(0.5L));
5167   evpbroadcastq(xtmp1, tmp, vec_enc);
5168   vaddpd(xtmp1, src , xtmp1, vec_enc);
5169   evcvtpd2qq(dst, xtmp1, vec_enc);
5170   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5172 
5173   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5174 }
5175 
5176 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5177                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5178                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5181   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5182 
5183   movl(tmp, jint_cast(0.5));
5184   movq(xtmp1, tmp);
5185   vbroadcastss(xtmp1, xtmp1, vec_enc);
5186   vaddps(xtmp1, src , xtmp1, vec_enc);
5187   vcvtps2dq(dst, xtmp1, vec_enc);
5188   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5189                                               float_sign_flip, vec_enc);
5190 
5191   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5192 }
5193 
5194 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5195                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5196                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
5199   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5200 
5201   movl(tmp, jint_cast(0.5));
5202   movq(xtmp1, tmp);
5203   vbroadcastss(xtmp1, xtmp1, vec_enc);
5204   vaddps(xtmp1, src , xtmp1, vec_enc);
5205   vcvtps2dq(dst, xtmp1, vec_enc);
5206   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5207 
5208   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5209 }
5210 #endif // _LP64
5211 
5212 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5213                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5214   switch (from_elem_bt) {
5215     case T_BYTE:
5216       switch (to_elem_bt) {
5217         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5218         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5219         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5220         default: ShouldNotReachHere();
5221       }
5222       break;
5223     case T_SHORT:
5224       switch (to_elem_bt) {
5225         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5226         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5227         default: ShouldNotReachHere();
5228       }
5229       break;
5230     case T_INT:
5231       assert(to_elem_bt == T_LONG, "");
5232       vpmovzxdq(dst, src, vlen_enc);
5233       break;
5234     default:
5235       ShouldNotReachHere();
5236   }
5237 }
5238 
5239 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5240                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5241   switch (from_elem_bt) {
5242     case T_BYTE:
5243       switch (to_elem_bt) {
5244         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5245         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5246         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5247         default: ShouldNotReachHere();
5248       }
5249       break;
5250     case T_SHORT:
5251       switch (to_elem_bt) {
5252         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5253         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5254         default: ShouldNotReachHere();
5255       }
5256       break;
5257     case T_INT:
5258       assert(to_elem_bt == T_LONG, "");
5259       vpmovsxdq(dst, src, vlen_enc);
5260       break;
5261     default:
5262       ShouldNotReachHere();
5263   }
5264 }
5265 
5266 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5267                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5268   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5269   assert(vlen_enc != AVX_512bit, "");
5270 
5271   int dst_bt_size = type2aelembytes(dst_bt);
5272   int src_bt_size = type2aelembytes(src_bt);
5273   if (dst_bt_size > src_bt_size) {
5274     switch (dst_bt_size / src_bt_size) {
5275       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5276       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5277       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5278       default: ShouldNotReachHere();
5279     }
5280   } else {
5281     assert(dst_bt_size < src_bt_size, "");
5282     switch (src_bt_size / dst_bt_size) {
5283       case 2: {
5284         if (vlen_enc == AVX_128bit) {
5285           vpacksswb(dst, src, src, vlen_enc);
5286         } else {
5287           vpacksswb(dst, src, src, vlen_enc);
5288           vpermq(dst, dst, 0x08, vlen_enc);
5289         }
5290         break;
5291       }
5292       case 4: {
5293         if (vlen_enc == AVX_128bit) {
5294           vpackssdw(dst, src, src, vlen_enc);
5295           vpacksswb(dst, dst, dst, vlen_enc);
5296         } else {
5297           vpackssdw(dst, src, src, vlen_enc);
5298           vpermq(dst, dst, 0x08, vlen_enc);
5299           vpacksswb(dst, dst, dst, AVX_128bit);
5300         }
5301         break;
5302       }
5303       case 8: {
5304         if (vlen_enc == AVX_128bit) {
5305           vpshufd(dst, src, 0x08, vlen_enc);
5306           vpackssdw(dst, dst, dst, vlen_enc);
5307           vpacksswb(dst, dst, dst, vlen_enc);
5308         } else {
5309           vpshufd(dst, src, 0x08, vlen_enc);
5310           vpermq(dst, dst, 0x08, vlen_enc);
5311           vpackssdw(dst, dst, dst, AVX_128bit);
5312           vpacksswb(dst, dst, dst, AVX_128bit);
5313         }
5314         break;
5315       }
5316       default: ShouldNotReachHere();
5317     }
5318   }
5319 }
5320 
5321 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5322                                    bool merge, BasicType bt, int vlen_enc) {
5323   if (bt == T_INT) {
5324     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5325   } else {
5326     assert(bt == T_LONG, "");
5327     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5328   }
5329 }
5330 
5331 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5332                                    bool merge, BasicType bt, int vlen_enc) {
5333   if (bt == T_INT) {
5334     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5335   } else {
5336     assert(bt == T_LONG, "");
5337     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5338   }
5339 }
5340 
5341 #ifdef _LP64
5342 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5343                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5344                                                int vec_enc) {
5345   int index = 0;
5346   int vindex = 0;
5347   mov64(rtmp1, 0x0101010101010101L);
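  // pdep with mask 0x0101...01 deposits bit i of src into bit 0 of byte lane i,
  // giving a byte-per-lane 0/1 image of the low 8 mask bits.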
5348   pdepq(rtmp1, src, rtmp1);
5349   if (mask_len > 8) {
5350     movq(rtmp2, src);
5351     vpxor(xtmp, xtmp, xtmp, vec_enc);
5352     movq(xtmp, rtmp1);
5353   }
5354   movq(dst, rtmp1);
5355 
5356   mask_len -= 8;
5357   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
5359     index++;
5360     if ((index % 2) == 0) {
5361       pxor(xtmp, xtmp);
5362     }
5363     mov64(rtmp1, 0x0101010101010101L);
5364     shrq(rtmp2, 8);
5365     pdepq(rtmp1, rtmp2, rtmp1);
5366     pinsrq(xtmp, rtmp1, index % 2);
5367     vindex = index / 2;
5368     if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
5371       if (index % 2) {
5372         vinsertf128(dst, dst, xtmp, vindex);
5373       }
5374     } else {
5375       vmovdqu(dst, xtmp);
5376     }
5377     mask_len -= 8;
5378   }
5379 }
5380 
5381 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5382   switch(opc) {
5383     case Op_VectorMaskTrueCount:
5384       popcntq(dst, tmp);
5385       break;
5386     case Op_VectorMaskLastTrue:
5387       if (VM_Version::supports_lzcnt()) {
5388         lzcntq(tmp, tmp);
5389         movl(dst, 63);
5390         subl(dst, tmp);
5391       } else {
5392         movl(dst, -1);
5393         bsrq(tmp, tmp);
5394         cmov32(Assembler::notZero, dst, tmp);
5395       }
5396       break;
5397     case Op_VectorMaskFirstTrue:
5398       if (VM_Version::supports_bmi1()) {
5399         if (masklen < 32) {
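          // Set a sentinel bit at position masklen so the count below yields
          // masklen when no mask bit is set.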
5400           orl(tmp, 1 << masklen);
5401           tzcntl(dst, tmp);
5402         } else if (masklen == 32) {
5403           tzcntl(dst, tmp);
5404         } else {
5405           assert(masklen == 64, "");
5406           tzcntq(dst, tmp);
5407         }
5408       } else {
5409         if (masklen < 32) {
5410           orl(tmp, 1 << masklen);
5411           bsfl(dst, tmp);
5412         } else {
5413           assert(masklen == 32 || masklen == 64, "");
5414           movl(dst, masklen);
5415           if (masklen == 32)  {
5416             bsfl(tmp, tmp);
5417           } else {
5418             bsfq(tmp, tmp);
5419           }
5420           cmov32(Assembler::notZero, dst, tmp);
5421         }
5422       }
5423       break;
5424     case Op_VectorMaskToLong:
5425       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5426       break;
5427     default: assert(false, "Unhandled mask operation");
5428   }
5429 }
5430 
5431 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5432                                               int masklen, int masksize, int vec_enc) {
5433   assert(VM_Version::supports_popcnt(), "");
5434 
  if (VM_Version::supports_avx512bw()) {
5436     kmovql(tmp, mask);
5437   } else {
5438     assert(masklen <= 16, "");
5439     kmovwl(tmp, mask);
5440   }
5441 
5442   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5443   // operations needs to be clipped.
5444   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5445     andq(tmp, (1 << masklen) - 1);
5446   }
5447 
5448   vector_mask_operation_helper(opc, dst, tmp, masklen);
5449 }
5450 
5451 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5452                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5453   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5454          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5455   assert(VM_Version::supports_popcnt(), "");
5456 
5457   bool need_clip = false;
5458   switch(bt) {
5459     case T_BOOLEAN:
5460       // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5461       vpxor(xtmp, xtmp, xtmp, vec_enc);
5462       vpsubb(xtmp, xtmp, mask, vec_enc);
5463       vpmovmskb(tmp, xtmp, vec_enc);
5464       need_clip = masklen < 16;
5465       break;
5466     case T_BYTE:
5467       vpmovmskb(tmp, mask, vec_enc);
5468       need_clip = masklen < 16;
5469       break;
5470     case T_SHORT:
5471       vpacksswb(xtmp, mask, mask, vec_enc);
5472       if (masklen >= 16) {
5473         vpermpd(xtmp, xtmp, 8, vec_enc);
5474       }
5475       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5476       need_clip = masklen < 16;
5477       break;
5478     case T_INT:
5479     case T_FLOAT:
5480       vmovmskps(tmp, mask, vec_enc);
5481       need_clip = masklen < 4;
5482       break;
5483     case T_LONG:
5484     case T_DOUBLE:
5485       vmovmskpd(tmp, mask, vec_enc);
5486       need_clip = masklen < 2;
5487       break;
5488     default: assert(false, "Unhandled type, %s", type2name(bt));
5489   }
5490 
5491   // Mask generated out of partial vector comparisons/replicate/mask manipulation
5492   // operations needs to be clipped.
5493   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5494     // need_clip implies masklen < 32
5495     andq(tmp, (1 << masklen) - 1);
5496   }
5497 
5498   vector_mask_operation_helper(opc, dst, tmp, masklen);
5499 }
5500 
5501 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5502                                              Register rtmp2, int mask_len) {
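       // Materialize a mask with the first popcount(src) lanes set: PEXT of an
       // all-ones value packs one set bit into the low order positions for every
       // set bit of the (clipped) source mask, which is the layout expected for a
       // compressed vector.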
5503   kmov(rtmp1, src);
5504   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5505   mov64(rtmp2, -1L);
5506   pextq(rtmp2, rtmp2, rtmp1);
5507   kmov(dst, rtmp2);
5508 }
5509 
5510 #ifdef _LP64
5511 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5512                                                     XMMRegister mask, Register rtmp, Register rscratch,
5513                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5514                                                     int vec_enc) {
5515   assert(type2aelembytes(bt) >= 4, "");
5516   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5517   address compress_perm_table = nullptr;
5518   address expand_perm_table = nullptr;
5519   if (type2aelembytes(bt) == 8) {
5520     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5521     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5522     vmovmskpd(rtmp, mask, vec_enc);
5523   } else {
5524     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5525     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5526     vmovmskps(rtmp, mask, vec_enc);
5527   }
5528   shlq(rtmp, 5); // for 32 byte permute row.
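       // Each possible mask value owns a 32 byte row of dword permute indices in the
       // table; rtmp now holds the byte offset of the row selected by the mask bits.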
5529   if (opcode == Op_CompressV) {
5530     lea(rscratch, ExternalAddress(compress_perm_table));
5531   } else {
5532     lea(rscratch, ExternalAddress(expand_perm_table));
5533   }
5534   addptr(rtmp, rscratch);
5535   vmovdqu(permv, Address(rtmp));
5536   vpermps(dst, permv, src, Assembler::AVX_256bit);
5537   vpxor(xtmp, xtmp, xtmp, vec_enc);
5538   // Blend the result with a zero vector using the permute mask: each column entry
5539   // in a permute table row contains either a valid permute index or a -1 (default)
5540   // value, so the row can also serve as a blending mask after
5541   // compressing/expanding the source vector lanes.
5542   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5543 }
5544 #endif
5545 
5546 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5547                                                bool merge, BasicType bt, int vec_enc) {
5548   if (opcode == Op_CompressV) {
5549     switch(bt) {
5550     case T_BYTE:
5551       evpcompressb(dst, mask, src, merge, vec_enc);
5552       break;
5553     case T_CHAR:
5554     case T_SHORT:
5555       evpcompressw(dst, mask, src, merge, vec_enc);
5556       break;
5557     case T_INT:
5558       evpcompressd(dst, mask, src, merge, vec_enc);
5559       break;
5560     case T_FLOAT:
5561       evcompressps(dst, mask, src, merge, vec_enc);
5562       break;
5563     case T_LONG:
5564       evpcompressq(dst, mask, src, merge, vec_enc);
5565       break;
5566     case T_DOUBLE:
5567       evcompresspd(dst, mask, src, merge, vec_enc);
5568       break;
5569     default:
5570       fatal("Unsupported type %s", type2name(bt));
5571       break;
5572     }
5573   } else {
5574     assert(opcode == Op_ExpandV, "");
5575     switch(bt) {
5576     case T_BYTE:
5577       evpexpandb(dst, mask, src, merge, vec_enc);
5578       break;
5579     case T_CHAR:
5580     case T_SHORT:
5581       evpexpandw(dst, mask, src, merge, vec_enc);
5582       break;
5583     case T_INT:
5584       evpexpandd(dst, mask, src, merge, vec_enc);
5585       break;
5586     case T_FLOAT:
5587       evexpandps(dst, mask, src, merge, vec_enc);
5588       break;
5589     case T_LONG:
5590       evpexpandq(dst, mask, src, merge, vec_enc);
5591       break;
5592     case T_DOUBLE:
5593       evexpandpd(dst, mask, src, merge, vec_enc);
5594       break;
5595     default:
5596       fatal("Unsupported type %s", type2name(bt));
5597       break;
5598     }
5599   }
5600 }
5601 #endif
5602 
5603 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5604                                            KRegister ktmp1, int vec_enc) {
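       // Vector Math.signum: the result is -1.0 where src < 0.0, +1.0 where src > 0.0,
       // and src itself where src is NaN, -0.0 or +0.0.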
5605   if (opcode == Op_SignumVD) {
5606     vsubpd(dst, zero, one, vec_enc);
5607     // if src < 0 ? -1 : 1
5608     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5609     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5610     // if src == NaN, -0.0 or 0.0 return src.
5611     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5612     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5613   } else {
5614     assert(opcode == Op_SignumVF, "");
5615     vsubps(dst, zero, one, vec_enc);
5616     // if src < 0 ? -1 : 1
5617     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5618     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5619     // if src == NaN, -0.0 or 0.0 return src.
5620     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5621     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5622   }
5623 }
5624 
5625 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5626                                           XMMRegister xtmp1, int vec_enc) {
5627   if (opcode == Op_SignumVD) {
5628     vsubpd(dst, zero, one, vec_enc);
5629     // if src < 0 ? -1 : 1
5630     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5631     // if src == NaN, -0.0 or 0.0 return src.
5632     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5633     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5634   } else {
5635     assert(opcode == Op_SignumVF, "");
5636     vsubps(dst, zero, one, vec_enc);
5637     // if src < 0 ? -1 : 1
5638     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5639     // if src == NaN, -0.0 or 0.0 return src.
5640     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5641     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5642   }
5643 }
5644 
5645 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
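       // src is expected to hold 0 or -1 (an all-zeros or all-ones bit pattern); the
       // shifts below trim the replicated bits down to the low mask_len mask positions.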
5646   if (VM_Version::supports_avx512bw()) {
5647     if (mask_len > 32) {
5648       kmovql(dst, src);
5649     } else {
5650       kmovdl(dst, src);
5651       if (mask_len != 32) {
5652         kshiftrdl(dst, dst, 32 - mask_len);
5653       }
5654     }
5655   } else {
5656     assert(mask_len <= 16, "");
5657     kmovwl(dst, src);
5658     if (mask_len != 16) {
5659       kshiftrwl(dst, dst, 16 - mask_len);
5660     }
5661   }
5662 }
5663 
5664 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5665   int lane_size = type2aelembytes(bt);
5666   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5667   if ((is_LP64 || lane_size < 8) &&
5668       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5669        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5670     movptr(rtmp, imm32);
5671     switch(lane_size) {
5672       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5673       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5674       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5675       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5676       default : fatal("Unsupported lane size %d", lane_size); break;
5678     }
5679   } else {
5680     movptr(rtmp, imm32);
5681     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5682     switch(lane_size) {
5683       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5684       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5685       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5686       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5687       default : fatal("Unsupported lane size %d", lane_size); break;
5689     }
5690   }
5691 }
5692 
5693 //
5694 // The following is a lookup table based popcount computation algorithm:
5695 //       Index   Bit set count
5696 //     [ 0000 ->   0,
5697 //       0001 ->   1,
5698 //       0010 ->   1,
5699 //       0011 ->   2,
5700 //       0100 ->   1,
5701 //       0101 ->   2,
5702 //       0110 ->   2,
5703 //       0111 ->   3,
5704 //       1000 ->   1,
5705 //       1001 ->   2,
5706 //       1010 ->   2,
5707 //       1011 ->   3,
5708 //       1100 ->   2,
5709 //       1101 ->   3,
     //       1110 ->   3,
5710 //       1111 ->   4 ]
5711 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5712 //     shuffle indices for lookup table access.
5713 //  b. Right shift each byte of vector lane by 4 positions.
5714 //  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5715 //     shuffle indices for lookup table access.
5716 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5717 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5718 //     count of all the bytes of a quadword.
5719 //  f. Perform step e. for upper 128bit vector lane.
5720 //  g. Pack the bitset count of quadwords back to double word.
5721 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
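     //
     // A scalar sketch of steps a-d for a single byte (illustrative only, not part of
     // the generated code; the lookup table values below are assumed to match
     // StubRoutines::x86::vector_popcount_lut()):
     //   static const uint8_t popcount_lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
     //   uint8_t popcount_byte(uint8_t b) {
     //     return popcount_lut[b & 0x0F] + popcount_lut[b >> 4];
     //   }
     // The vector code performs the same two lookups for all byte lanes at once via
     // vpshufb.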
5722 
5723 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5724                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5725   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5726   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5727   vpsrlw(dst, src, 4, vec_enc);
5728   vpand(dst, dst, xtmp1, vec_enc);
5729   vpand(xtmp1, src, xtmp1, vec_enc);
5730   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5731   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5732   vpshufb(dst, xtmp2, dst, vec_enc);
5733   vpaddb(dst, dst, xtmp1, vec_enc);
5734 }
5735 
5736 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5737                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5738   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5739   // Following code is as per steps e,f,g and h of above algorithm.
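       // vpsadbw against a zero vector adds the byte counts within each quadword into
       // its low 16 bits; the unpack steps below zero-extend each dword of byte counts
       // to a quadword first, and vpackuswb packs the quadword sums back to dword lanes.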
5740   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5741   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5742   vpsadbw(dst, dst, xtmp2, vec_enc);
5743   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5744   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5745   vpackuswb(dst, xtmp1, dst, vec_enc);
5746 }
5747 
5748 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5749                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5750   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5751   // Add the popcount of upper and lower bytes of word.
5752   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5753   vpsrlw(dst, xtmp1, 8, vec_enc);
5754   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5755   vpaddw(dst, dst, xtmp1, vec_enc);
5756 }
5757 
5758 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5759                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5760   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5761   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5762   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5763 }
5764 
5765 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5766                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5767   switch(bt) {
5768     case T_LONG:
5769       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5770       break;
5771     case T_INT:
5772       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5773       break;
5774     case T_CHAR:
5775     case T_SHORT:
5776       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5777       break;
5778     case T_BYTE:
5779     case T_BOOLEAN:
5780       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5781       break;
5782     default:
5783       fatal("Unsupported type %s", type2name(bt));
5784       break;
5785   }
5786 }
5787 
5788 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5789                                                       KRegister mask, bool merge, int vec_enc) {
5790   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5791   switch(bt) {
5792     case T_LONG:
5793       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5794       evpopcntq(dst, mask, src, merge, vec_enc);
5795       break;
5796     case T_INT:
5797       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5798       evpopcntd(dst, mask, src, merge, vec_enc);
5799       break;
5800     case T_CHAR:
5801     case T_SHORT:
5802       assert(VM_Version::supports_avx512_bitalg(), "");
5803       evpopcntw(dst, mask, src, merge, vec_enc);
5804       break;
5805     case T_BYTE:
5806     case T_BOOLEAN:
5807       assert(VM_Version::supports_avx512_bitalg(), "");
5808       evpopcntb(dst, mask, src, merge, vec_enc);
5809       break;
5810     default:
5811       fatal("Unsupported type %s", type2name(bt));
5812       break;
5813   }
5814 }
5815 
5816 #ifndef _LP64
5817 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5818   assert(VM_Version::supports_avx512bw(), "");
5819   kmovdl(tmp, src);
5820   kunpckdql(dst, tmp, tmp);
5821 }
5822 #endif
5823 
5824 // The bit reversal algorithm first reverses the bits of each byte and then
5825 // performs a byte level reversal for multi-byte primitive types (short/int/long).
5826 // A lookup table access yields the reversed bit sequence corresponding to each
5827 // 4 bit value. The reversed bit sequence of a byte is therefore obtained by
5828 // swapping the reversed bit sequences of its upper and lower
5829 // nibbles.
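     //
     // A scalar sketch of the per-byte step (illustrative only; the table values are
     // assumed to match StubRoutines::x86::vector_reverse_bit_lut()):
     //   static const uint8_t rev_nibble_lut[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
     //                                              0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
     //   uint8_t reverse_byte(uint8_t b) {
     //     return (uint8_t)((rev_nibble_lut[b & 0x0F] << 4) | rev_nibble_lut[b >> 4]);
     //   }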
5830 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5831                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5832   if (VM_Version::supports_avx512vlbw()) {
5833 
5834     // Get the reverse bit sequence of lower nibble of each byte.
5835     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5836     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5837     evpandq(dst, xtmp2, src, vec_enc);
5838     vpshufb(dst, xtmp1, dst, vec_enc);
5839     vpsllq(dst, dst, 4, vec_enc);
5840 
5841     // Get the reverse bit sequence of upper nibble of each byte.
5842     vpandn(xtmp2, xtmp2, src, vec_enc);
5843     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5844     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5845 
5846     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5847     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5848     evporq(xtmp2, dst, xtmp2, vec_enc);
5849     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5850 
5851   } else if(vec_enc == Assembler::AVX_512bit) {
5852     // Shift based bit reversal.
5853     assert(bt == T_LONG || bt == T_INT, "");
5854 
5855     // Swap lower and upper nibble of each byte.
5856     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5857 
5858     // Swap two least and most significant bits of each nibble.
5859     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5860 
5861     // Swap adjacent pair of bits.
5862     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5863     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5864 
5865     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5866     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5867   } else {
5868     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5869     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5870 
5871     // Get the reverse bit sequence of lower nibble of each byte.
5872     vpand(dst, xtmp2, src, vec_enc);
5873     vpshufb(dst, xtmp1, dst, vec_enc);
5874     vpsllq(dst, dst, 4, vec_enc);
5875 
5876     // Get the reverse bit sequence of upper nibble of each byte.
5877     vpandn(xtmp2, xtmp2, src, vec_enc);
5878     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5879     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5880 
5881     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5882     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5883     vpor(xtmp2, dst, xtmp2, vec_enc);
5884     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5885   }
5886 }
5887 
5888 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5889                                                 XMMRegister xtmp, Register rscratch) {
5890   assert(VM_Version::supports_gfni(), "");
5891   assert(rscratch != noreg || always_reachable(mask), "missing");
5892 
5893   // Galois field instruction based bit reversal based on following algorithm.
5894   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5895   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5896   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5897   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5898 }
5899 
5900 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5901                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
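       // dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)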
5902   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5903   evpandq(dst, xtmp1, src, vec_enc);
5904   vpsllq(dst, dst, nbits, vec_enc);
5905   vpandn(xtmp1, xtmp1, src, vec_enc);
5906   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5907   evporq(dst, dst, xtmp1, vec_enc);
5908 }
5909 
5910 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5911                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5912   // Shift based bit reversal.
5913   assert(VM_Version::supports_evex(), "");
5914   switch(bt) {
5915     case T_LONG:
5916       // Swap upper and lower double word of each quad word.
5917       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5918       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5919       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5920       break;
5921     case T_INT:
5922       // Swap upper and lower word of each double word.
5923       evprord(xtmp1, k0, src, 16, true, vec_enc);
5924       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5925       break;
5926     case T_CHAR:
5927     case T_SHORT:
5928       // Swap upper and lower byte of each word.
5929       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5930       break;
5931     case T_BYTE:
5932       evmovdquq(dst, k0, src, true, vec_enc);
5933       break;
5934     default:
5935       fatal("Unsupported type %s", type2name(bt));
5936       break;
5937   }
5938 }
5939 
5940 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5941   if (bt == T_BYTE) {
5942     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5943       evmovdquq(dst, k0, src, true, vec_enc);
5944     } else {
5945       vmovdqu(dst, src);
5946     }
5947     return;
5948   }
5949   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5950   // pre-computed shuffle indices.
5951   switch(bt) {
5952     case T_LONG:
5953       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5954       break;
5955     case T_INT:
5956       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5957       break;
5958     case T_CHAR:
5959     case T_SHORT:
5960       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5961       break;
5962     default:
5963       fatal("Unsupported type %s", type2name(bt));
5964       break;
5965   }
5966   vpshufb(dst, src, dst, vec_enc);
5967 }
5968 
5969 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5970                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5971                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5972   assert(is_integral_type(bt), "");
5973   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5974   assert(VM_Version::supports_avx512cd(), "");
5975   switch(bt) {
5976     case T_LONG:
5977       evplzcntq(dst, ktmp, src, merge, vec_enc);
5978       break;
5979     case T_INT:
5980       evplzcntd(dst, ktmp, src, merge, vec_enc);
5981       break;
5982     case T_SHORT:
5983       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5984       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5985       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5986       vpunpckhwd(dst, xtmp1, src, vec_enc);
5987       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5988       vpackusdw(dst, xtmp2, dst, vec_enc);
5989       break;
5990     case T_BYTE:
5991       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5992       // accessing the lookup table.
5993       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5994       // accessing the lookup table.
5995       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5996       assert(VM_Version::supports_avx512bw(), "");
5997       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5998       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5999       vpand(xtmp2, dst, src, vec_enc);
6000       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6001       vpsrlw(xtmp3, src, 4, vec_enc);
6002       vpand(xtmp3, dst, xtmp3, vec_enc);
6003       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6004       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6005       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6006       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6007       break;
6008     default:
6009       fatal("Unsupported type %s", type2name(bt));
6010       break;
6011   }
6012 }
6013 
6014 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6015                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6016   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6017   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6018   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6019   // accessing the lookup table.
6020   vpand(dst, xtmp2, src, vec_enc);
6021   vpshufb(dst, xtmp1, dst, vec_enc);
6022   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6023   // accessing the lookup table.
6024   vpsrlw(xtmp3, src, 4, vec_enc);
6025   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6026   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6027   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6028   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6029   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6030   vpaddb(dst, dst, xtmp2, vec_enc);
6031   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6032 }
6033 
6034 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6035                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6036   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6037   // Add zero counts of lower byte and upper byte of a word if
6038   // upper byte holds a zero value.
6039   vpsrlw(xtmp3, src, 8, vec_enc);
6040   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6041   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6042   vpsllw(xtmp2, dst, 8, vec_enc);
6043   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6044   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6045   vpsrlw(dst, dst, 8, vec_enc);
6046 }
6047 
6048 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6049                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6050   // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
6051   // the biased exponent can be used to compute the leading zero count:
6052   //   LZCNT = 31 - (biased_exp - 127)
6053   // which the code below evaluates as 32 - ((biased_exp - 127) + 1).
6054   // Special handling has been introduced for zero, max_int and -ve source values.
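       // e.g. src = 8: (float)8.0f has biased exponent 130, so the code computes
       // 32 - ((130 - 127) + 1) = 28 = Integer.numberOfLeadingZeros(8).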
6055 
6056   // Broadcast 0xFF
6057   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6058   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6059 
6060   // Extract biased exponent.
6061   vcvtdq2ps(dst, src, vec_enc);
6062   vpsrld(dst, dst, 23, vec_enc);
6063   vpand(dst, dst, xtmp1, vec_enc);
6064 
6065   // Broadcast 127.
6066   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6067   // Exponent = biased_exp - 127
6068   vpsubd(dst, dst, xtmp1, vec_enc);
6069 
6070   // Exponent = Exponent  + 1
6071   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6072   vpaddd(dst, dst, xtmp3, vec_enc);
6073 
6074   // Replace -ve exponent with zero, exponent is -ve when src
6075   // lane contains a zero value.
6076   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6077   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6078 
6079   // Rematerialize broadcast 32.
6080   vpslld(xtmp1, xtmp3, 5, vec_enc);
6081   // Exponent is 32 if corresponding source lane contains max_int value.
6082   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6083   // LZCNT = 32 - exponent
6084   vpsubd(dst, xtmp1, dst, vec_enc);
6085 
6086   // Replace LZCNT with a value 1 if corresponding source lane
6087   // contains max_int value.
6088   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6089 
6090   // Replace biased_exp with 0 if source lane value is less than zero.
6091   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6092   vblendvps(dst, dst, xtmp2, src, vec_enc);
6093 }
6094 
6095 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6096                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6097   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6098   // Add zero counts of lower word and upper word of a double word if
6099   // upper word holds a zero value.
6100   vpsrld(xtmp3, src, 16, vec_enc);
6101   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6102   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6103   vpslld(xtmp2, dst, 16, vec_enc);
6104   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6105   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6106   vpsrld(dst, dst, 16, vec_enc);
6107   // Add zero counts of lower doubleword and upper doubleword of a
6108   // quadword if upper doubleword holds a zero value.
6109   vpsrlq(xtmp3, src, 32, vec_enc);
6110   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6111   vpsllq(xtmp2, dst, 32, vec_enc);
6112   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6113   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6114   vpsrlq(dst, dst, 32, vec_enc);
6115 }
6116 
6117 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6118                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6119                                                        Register rtmp, int vec_enc) {
6120   assert(is_integral_type(bt), "unexpected type");
6121   assert(vec_enc < Assembler::AVX_512bit, "");
6122   switch(bt) {
6123     case T_LONG:
6124       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6125       break;
6126     case T_INT:
6127       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6128       break;
6129     case T_SHORT:
6130       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6131       break;
6132     case T_BYTE:
6133       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6134       break;
6135     default:
6136       fatal("Unsupported type %s", type2name(bt));
6137       break;
6138   }
6139 }
6140 
6141 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6142   switch(bt) {
6143     case T_BYTE:
6144       vpsubb(dst, src1, src2, vec_enc);
6145       break;
6146     case T_SHORT:
6147       vpsubw(dst, src1, src2, vec_enc);
6148       break;
6149     case T_INT:
6150       vpsubd(dst, src1, src2, vec_enc);
6151       break;
6152     case T_LONG:
6153       vpsubq(dst, src1, src2, vec_enc);
6154       break;
6155     default:
6156       fatal("Unsupported type %s", type2name(bt));
6157       break;
6158   }
6159 }
6160 
6161 // Trailing zero count computation is based on the leading zero count operation as per
6162 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
6163 // a direct vector instruction to compute the leading zero count.
6164 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
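     // e.g. for x = 8: (x - 1) & ~x = 0b0111, CLZ = 29 and CTZ = 32 - 29 = 3; for x = 0
     // the expression is all ones, CLZ = 0 and CTZ = 32.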
6165 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6166                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6167                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6168   assert(is_integral_type(bt), "");
6169   // xtmp = -1
6170   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6171   // xtmp = xtmp + src
6172   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6173   // xtmp = xtmp & ~src
6174   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6175   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6176   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6177   vpsub(bt, dst, xtmp4, dst, vec_enc);
6178 }
6179 
6180 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
6181 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
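     // e.g. for x = 8: x | -x = 0xFFFFFFF8, POPC = 29 and CTZ = 32 - 29 = 3; for x = 0
     // the expression is 0, POPC = 0 and CTZ = 32.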
6182 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6183                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6184   assert(is_integral_type(bt), "");
6185   // xtmp = 0
6186   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6187   // xtmp = 0 - src
6188   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6189   // xtmp = xtmp | src
6190   vpor(xtmp3, xtmp3, src, vec_enc);
6191   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6192   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6193   vpsub(bt, dst, xtmp1, dst, vec_enc);
6194 }
6195 
6196 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6197   Label done;
6198   Label neg_divisor_fastpath;
6199   cmpl(divisor, 0);
6200   jccb(Assembler::less, neg_divisor_fastpath);
6201   xorl(rdx, rdx);
6202   divl(divisor);
6203   jmpb(done);
6204   bind(neg_divisor_fastpath);
6205   // Fastpath for divisor < 0:
6206   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6207   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
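       // With the divisor's sign bit set, its unsigned value is at least 2^31, so the
       // unsigned quotient can only be 0 or 1; the expression evaluates to 1 exactly
       // when dividend >= divisor (compared as unsigned).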
6208   movl(rdx, rax);
6209   subl(rdx, divisor);
6210   if (VM_Version::supports_bmi1()) {
6211     andnl(rax, rdx, rax);
6212   } else {
6213     notl(rdx);
6214     andl(rax, rdx);
6215   }
6216   shrl(rax, 31);
6217   bind(done);
6218 }
6219 
6220 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6221   Label done;
6222   Label neg_divisor_fastpath;
6223   cmpl(divisor, 0);
6224   jccb(Assembler::less, neg_divisor_fastpath);
6225   xorl(rdx, rdx);
6226   divl(divisor);
6227   jmpb(done);
6228   bind(neg_divisor_fastpath);
6229   // Fastpath when divisor < 0:
6230   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6231   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
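       // As above, the unsigned quotient is 0 or 1, so the remainder is dividend when
       // the quotient is 0 and dividend - divisor when it is 1.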
6232   movl(rdx, rax);
6233   subl(rax, divisor);
6234   if (VM_Version::supports_bmi1()) {
6235     andnl(rax, rax, rdx);
6236   } else {
6237     notl(rax);
6238     andl(rax, rdx);
6239   }
6240   sarl(rax, 31);
6241   andl(rax, divisor);
6242   subl(rdx, rax);
6243   bind(done);
6244 }
6245 
6246 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6247   Label done;
6248   Label neg_divisor_fastpath;
6249 
6250   cmpl(divisor, 0);
6251   jccb(Assembler::less, neg_divisor_fastpath);
6252   xorl(rdx, rdx);
6253   divl(divisor);
6254   jmpb(done);
6255   bind(neg_divisor_fastpath);
6256   // Fastpath for divisor < 0:
6257   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6258   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6259   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6260   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6261   movl(rdx, rax);
6262   subl(rax, divisor);
6263   if (VM_Version::supports_bmi1()) {
6264     andnl(rax, rax, rdx);
6265   } else {
6266     notl(rax);
6267     andl(rax, rdx);
6268   }
6269   movl(tmp, rax);
6270   shrl(rax, 31); // quotient
6271   sarl(tmp, 31);
6272   andl(tmp, divisor);
6273   subl(rdx, tmp); // remainder
6274   bind(done);
6275 }
6276 
6277 #ifdef _LP64
6278 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6279                                  XMMRegister xtmp2, Register rtmp) {
6280   if(VM_Version::supports_gfni()) {
6281     // Galois field instruction based bit reversal based on following algorithm.
6282     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6283     mov64(rtmp, 0x8040201008040201L);
6284     movq(xtmp1, src);
6285     movq(xtmp2, rtmp);
6286     gf2p8affineqb(xtmp1, xtmp2, 0);
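         // xtmp1 now holds src with the bit order of each byte reversed; the bswapl
         // below reverses the byte order to complete the 32 bit reversal.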
6287     movq(dst, xtmp1);
6288   } else {
6289     // Swap even and odd numbered bits.
6290     movl(rtmp, src);
6291     andl(rtmp, 0x55555555);
6292     shll(rtmp, 1);
6293     movl(dst, src);
6294     andl(dst, 0xAAAAAAAA);
6295     shrl(dst, 1);
6296     orl(dst, rtmp);
6297 
6298     // Swap LSB and MSB 2 bits of each nibble.
6299     movl(rtmp, dst);
6300     andl(rtmp, 0x33333333);
6301     shll(rtmp, 2);
6302     andl(dst, 0xCCCCCCCC);
6303     shrl(dst, 2);
6304     orl(dst, rtmp);
6305 
6306     // Swap LSB and MSB 4 bits of each byte.
6307     movl(rtmp, dst);
6308     andl(rtmp, 0x0F0F0F0F);
6309     shll(rtmp, 4);
6310     andl(dst, 0xF0F0F0F0);
6311     shrl(dst, 4);
6312     orl(dst, rtmp);
6313   }
6314   bswapl(dst);
6315 }
6316 
6317 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6318                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6319   if(VM_Version::supports_gfni()) {
6320     // Galois field instruction based bit reversal based on following algorithm.
6321     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6322     mov64(rtmp1, 0x8040201008040201L);
6323     movq(xtmp1, src);
6324     movq(xtmp2, rtmp1);
6325     gf2p8affineqb(xtmp1, xtmp2, 0);
6326     movq(dst, xtmp1);
6327   } else {
6328     // Swap even and odd numbered bits.
6329     movq(rtmp1, src);
6330     mov64(rtmp2, 0x5555555555555555L);
6331     andq(rtmp1, rtmp2);
6332     shlq(rtmp1, 1);
6333     movq(dst, src);
6334     notq(rtmp2);
6335     andq(dst, rtmp2);
6336     shrq(dst, 1);
6337     orq(dst, rtmp1);
6338 
6339     // Swap LSB and MSB 2 bits of each nibble.
6340     movq(rtmp1, dst);
6341     mov64(rtmp2, 0x3333333333333333L);
6342     andq(rtmp1, rtmp2);
6343     shlq(rtmp1, 2);
6344     notq(rtmp2);
6345     andq(dst, rtmp2);
6346     shrq(dst, 2);
6347     orq(dst, rtmp1);
6348 
6349     // Swap LSB and MSB 4 bits of each byte.
6350     movq(rtmp1, dst);
6351     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6352     andq(rtmp1, rtmp2);
6353     shlq(rtmp1, 4);
6354     notq(rtmp2);
6355     andq(dst, rtmp2);
6356     shrq(dst, 4);
6357     orq(dst, rtmp1);
6358   }
6359   bswapq(dst);
6360 }
6361 
6362 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6363   Label done;
6364   Label neg_divisor_fastpath;
6365   cmpq(divisor, 0);
6366   jccb(Assembler::less, neg_divisor_fastpath);
6367   xorl(rdx, rdx);
6368   divq(divisor);
6369   jmpb(done);
6370   bind(neg_divisor_fastpath);
6371   // Fastpath for divisor < 0:
6372   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6373   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6374   movq(rdx, rax);
6375   subq(rdx, divisor);
6376   if (VM_Version::supports_bmi1()) {
6377     andnq(rax, rdx, rax);
6378   } else {
6379     notq(rdx);
6380     andq(rax, rdx);
6381   }
6382   shrq(rax, 63);
6383   bind(done);
6384 }
6385 
6386 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6387   Label done;
6388   Label neg_divisor_fastpath;
6389   cmpq(divisor, 0);
6390   jccb(Assembler::less, neg_divisor_fastpath);
6391   xorq(rdx, rdx);
6392   divq(divisor);
6393   jmp(done);
6394   bind(neg_divisor_fastpath);
6395   // Fastpath when divisor < 0:
6396   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6397   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6398   movq(rdx, rax);
6399   subq(rax, divisor);
6400   if (VM_Version::supports_bmi1()) {
6401     andnq(rax, rax, rdx);
6402   } else {
6403     notq(rax);
6404     andq(rax, rdx);
6405   }
6406   sarq(rax, 63);
6407   andq(rax, divisor);
6408   subq(rdx, rax);
6409   bind(done);
6410 }
6411 
6412 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6413   Label done;
6414   Label neg_divisor_fastpath;
6415   cmpq(divisor, 0);
6416   jccb(Assembler::less, neg_divisor_fastpath);
6417   xorq(rdx, rdx);
6418   divq(divisor);
6419   jmp(done);
6420   bind(neg_divisor_fastpath);
6421   // Fastpath for divisor < 0:
6422   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6423   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6424   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6425   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6426   movq(rdx, rax);
6427   subq(rax, divisor);
6428   if (VM_Version::supports_bmi1()) {
6429     andnq(rax, rax, rdx);
6430   } else {
6431     notq(rax);
6432     andq(rax, rdx);
6433   }
6434   movq(tmp, rax);
6435   shrq(rax, 63); // quotient
6436   sarq(tmp, 63);
6437   andq(tmp, divisor);
6438   subq(rdx, tmp); // remainder
6439   bind(done);
6440 }
6441 #endif
6442 
6443 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6444                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6445                                         int vlen_enc) {
6446   assert(VM_Version::supports_avx512bw(), "");
6447   // Byte shuffles are in-lane operations and the selected element is determined
6448   // by the lower 4 bits of each shuffle index, so all shuffle indices are
6449   // effectively normalized to the index range 0-15. Indices that differ by a
6450   // multiple of 16 therefore select the same relative position within a 128 bit
6451   // lane, i.e. shuffle indices 16, 32 and 48 all select the first element of
6452   // their respective 128 bit source lanes.
6453   movl(rtmp, 16);
6454   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6455 
6456   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6457   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6458   // original shuffle indices and move the shuffled lanes corresponding to true
6459   // mask to destination vector.
6460   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6461   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6462   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6463 
6464   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6465   // and broadcasting second 128 bit lane.
6466   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6467   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6468   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6469   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6470   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6471 
6472   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6473   // and broadcasting third 128 bit lane.
6474   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6475   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6476   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6477   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6478   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6479 
6480   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6481   // and broadcasting fourth 128 bit lane.
6482   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6483   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6484   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6485   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6486   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6487 }
6488 
6489 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6490                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6491   if (vlen_enc == AVX_128bit) {
6492     vpermilps(dst, src, shuffle, vlen_enc);
6493   } else if (bt == T_INT) {
6494     vpermd(dst, shuffle, src, vlen_enc);
6495   } else {
6496     assert(bt == T_FLOAT, "");
6497     vpermps(dst, shuffle, src, vlen_enc);
6498   }
6499 }